Comments (2)
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <iostream>
#include <vector>
#include "../test_helper.h"
#include "../test_include.h"
#include "operators/math/gemm.h"
#include "operators/math/math_function.h"
// Row-major element accessors for raw buffers named a, b and c1, indexed with
// the leading dimensions lda, ldb and ldc. None of these macros is used by the
// main() in this file, and lda/ldb/ldc are not declared in the visible source.
#define a(i, j) a[(i)*lda + (j)]
#define b(i, j) b[(i)*ldb + (j)]
#define c1(i, j) c1[(i)*ldc + (j)]
// Matrix dimensions used by main(): A is m x k, B is k x n and C is m x n.
// NOTE(review): these are object-like macros with one-letter names, so every
// later identifier spelled m, n or k in this translation unit is replaced by
// 1024 by the preprocessor.
#define m 1024
#define n 1024
#define k 1024
int main() {
paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
paddle_mobile.SetThreadNum(4);
Tensor aa, bb, cc;
auto aaptr = aa.mutable_data<float>({m, k});
auto bbptr = bb.mutable_data<float>({k, n});
auto ccptr = cc.mutable_data<float>({m, n});
for (int i = 0; i < m * k; ++i) {
aaptr[i] = 2;
}
for (int i = 0; i < k * n; ++i) {
bbptr[i] = 2;
}
for (int i = 0; i < m * n; ++i) {
ccptr[i] = 2;
}
Tensor aa_int8, bb_int8, cc_int32, cc_int8;
auto aaptr_int8 = aa_int8.mutable_data<int8_t>({m, k});
auto bbptr_int8 = bb_int8.mutable_data<int8_t>({k, n});
auto ccptr_int32 = cc_int32.mutable_data<int32_t>({m, n});
auto ccptr_int8 = cc_int8.mutable_data<int8_t>({m, n});
int32_t* bias_data_col = new int32_t[m];
int32_t* bias_data_row = new int32_t[n];
for (int i = 0; i < m * k; ++i) {
aaptr_int8[i] = static_cast<int8_t>(2);
}
for (int i = 0; i < k * n; ++i) {
bbptr_int8[i] = static_cast<int8_t>(2);
}
for (int i = 0; i < m * n; ++i) {
ccptr_int32[i] = static_cast<int32_t>(2);
}
for (int i = 0; i < m; ++i) {
bias_data_col[i] = 2;
}
for (int i = 0; i < n; ++i) {
bias_data_row[i] = 2;
}
// float
// warm-up 10 times
for (int j = 0; j < 10; ++j) {
paddle_mobile::operators::math::MatMul<float, float>(
aa, false, bb, false, static_cast<float>(1), &cc, static_cast<float>(0),
false, nullptr);
}
auto time_start0 = time();
for (int j = 0; j < 10; ++j) {
paddle_mobile::operators::math::MatMul<float, float>(
aa, false, bb, false, static_cast<float>(1), &cc, static_cast<float>(0),
false, nullptr);
}
auto time_end0 = time();
std::cout << "float gemm cost :" << time_diff(time_start0, time_end0) / 10
<< "ms\n";
// int8_t without bias
// warm-up 10 times
for (int j = 0; j < 10; ++j) {
paddle_mobile::operators::math::MatMul<int8_t, int32_t>(
aa_int8, false, bb_int8, false, static_cast<float>(1), &cc_int32,
static_cast<float>(0));
}
auto time_start1 = time();
for (int j = 0; j < 10; ++j) {
paddle_mobile::operators::math::MatMul<int8_t, int32_t>(
aa_int8, false, bb_int8, false, static_cast<float>(1), &cc_int32,
static_cast<float>(0));
}
auto time_end1 = time();
std::cout << "int8_t gemm cost :" << time_diff(time_start1, time_end1) / 10
<< "ms\n";
// int8_t with bias, column element wise add
// warm-up 10 times
for (int j = 0; j < 10; ++j) {
paddle_mobile::operators::math::MatMul<int8_t, int32_t>(
aa_int8, false, bb_int8, false, static_cast<float>(0.618), &cc_int8,
static_cast<float>(0), false, bias_data_col, false);
}
auto time_start2 = time();
for (int j = 0; j < 10; ++j) {
paddle_mobile::operators::math::MatMul<int8_t, int32_t>(
aa_int8, false, bb_int8, false, static_cast<float>(0.618), &cc_int8,
static_cast<float>(0), false, bias_data_col, false);
}
auto time_end2 = time();
std::cout << "int8_t gemm_with_bias(column add) cost :"
<< time_diff(time_start2, time_end2) / 10 << "ms\n";
// int8_t with bias, row element wise add
// warm-up 10 times
for (int j = 0; j < 10; ++j) {
paddle_mobile::operators::math::MatMul<int8_t, int32_t>(
aa_int8, false, bb_int8, false, static_cast<float>(0.618), &cc_int8,
static_cast<float>(0), false, bias_data_row, true);
}
auto time_start3 = time();
for (int j = 0; j < 10; ++j) {
paddle_mobile::operators::math::MatMul<int8_t, int32_t>(
aa_int8, false, bb_int8, false, static_cast<float>(0.618), &cc_int8,
static_cast<float>(0), false, bias_data_row, true);
}
auto time_end3 = time();
std::cout << "int8_t gemm_with_bias(row add) cost :"
<< time_diff(time_start3, time_end3) / 10 << "ms\n";
// int8_t with bias&relu
// warm-up 10 times
for (int j = 0; j < 10; ++j) {
paddle_mobile::operators::math::MatMul<int8_t, int32_t>(
aa_int8, false, bb_int8, false, static_cast<float>(0.618), &cc_int8,
static_cast<float>(0), true, bias_data_col, false);
}
auto time_start4 = time();
for (int j = 0; j < 10; ++j) {
paddle_mobile::operators::math::MatMul<int8_t, int32_t>(
aa_int8, false, bb_int8, false, static_cast<float>(0.618), &cc_int8,
static_cast<float>(0), true, bias_data_col, false);
}
auto time_end4 = time();
std::cout << "int8_t gemm_with_bias_relu cost :"
<< time_diff(time_start4, time_end4) / 10 << "ms\n";
delete[] bias_data_row;
delete[] bias_data_col;
return 0;
}
from deep-learning-operator.
Segmentation Fault与堆内存
跑 `m=1, n=1~60000, k=1~60000`(其中 `n` 和 `k` 以1000递增)的 `gemv_trans_mx1` 时,当跑到 `m=1, n=8000, k=30000` 时出现Segmentation Fault并卡住。用 `CTRL+C` 中断程序后,立刻用 `logcat` 查询当前日志(根据这份adb logcat总结,发现两个常用命令:`adb logcat -v time`、`adb logcat -c`),在日志不再打印的地方再次中断,并用 `date` 打印当前时间,以确认打印出的日志是否符合预期。
如下是捕捉到Segmentation Fault后的日志信息,其中可以看到执行的程序名 `test-gemm-accuracy`;再就是对应的 `Build fingerprint` 部分,其中写有 `Cause: null pointer dereference`,即崩溃的原因是空指针解引用。因为代码中没有对 `malloc` 返回的指针做判空,猜测原因是堆内存申请失败,返回了 `nullptr`。
11-04 17:28:42.889 27339 27339 F libc : Fatal signal 11 (SIGSEGV), code 1, fault addr 0x0 in tid 27339 (test-gemm-accur)
11-04 17:28:42.915 27342 27342 W crash_dump32: type=1400 audit(0.0:2846): avc: denied { search } for name="tmp" dev="sda17" ino=2097154 scontext=u:r:crash_dump:s0 tcontext=u:object_r:shell_data_file:s0 tc
lass=dir permissive=0
11-04 17:28:42.915 27342 27342 W crash_dump32: type=1400 audit(0.0:2847): avc: denied { search } for name="tmp" dev="sda17" ino=2097154 scontext=u:r:crash_dump:s0 tcontext=u:object_r:shell_data_file:s0 tc
lass=dir permissive=0
11-04 17:28:42.915 27342 27342 W crash_dump32: type=1400 audit(0.0:2848): avc: denied { search } for name="bin" dev="sda17" ino=2097510 scontext=u:r:crash_dump:s0 tcontext=u:object_r:shell_data_file:s0 tc
lass=dir permissive=0
11-04 17:28:42.925 27342 27342 I crash_dump32: obtaining output fd from tombstoned
11-04 17:28:42.925 959 959 I /system/bin/tombstoned: received crash request for pid 27339
11-04 17:28:42.927 27342 27342 I crash_dump32: performing dump of process 27339 (target tid = 27339)
11-04 17:28:42.915 27342 27342 W crash_dump32: type=1400 audit(0.0:2849): avc: denied { search } for name="tmp" dev="sda17" ino=2097154 scontext=u:r:crash_dump:s0 tcontext=u:object_r:shell_data_file:s0 tc
lass=dir permissive=0
11-04 17:28:42.928 27342 27342 F DEBUG : *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
11-04 17:28:42.928 27342 27342 F DEBUG : Build fingerprint: 'Xiaomi/chiron/chiron:8.0.0/OPR1.170623.027/V10.0.1.0.ODECNFH:user/release-keys'
11-04 17:28:42.928 27342 27342 F DEBUG : Revision: '0'
11-04 17:28:42.928 27342 27342 F DEBUG : ABI: 'arm'
11-04 17:28:42.928 27342 27342 F DEBUG : pid: 27339, tid: 27339, name: test-gemm-accur >>> ./test-gemm-accuracy <<<
11-04 17:28:42.928 27342 27342 F DEBUG : signal 11 (SIGSEGV), code 1 (SEGV_MAPERR), fault addr 0x0
11-04 17:28:42.929 27342 27342 F DEBUG : Cause: null pointer dereference
11-04 17:28:42.929 27342 27342 F DEBUG : r0 0000fa00 r1 00000000 r2 00000000 r3 00000004
11-04 17:28:42.929 27342 27342 F DEBUG : r4 ffafa010 r5 00003e80 r6 ef200040 r7 000493e0
11-04 17:28:42.929 27342 27342 F DEBUG : r8 00000000 r9 00000000 sl 000493dd fp 00000001
11-04 17:28:42.929 27342 27342 F DEBUG : ip efa47440 sp ffaf9fc0 lr 00000000 pc ef66c152 cpsr 00010030
11-04 17:28:42.929 27342 27342 F DEBUG :
11-04 17:28:42.929 27342 27342 F DEBUG : backtrace:
11-04 17:28:42.929 27342 27342 F DEBUG : #00 pc 00226152 /data/local/tmp/bin/libpaddle-mobile.so
11-04 17:28:42.929 27342 27342 F DEBUG : #01 pc 002923df /data/local/tmp/bin/libpaddle-mobile.so
11-04 17:28:42.937 27342 27342 E crash_dump32: cannot open libmiuindbg.so: No such file or directory
11-04 17:28:42.938 1557 2753 W NativeCrashListener: Couldn't find ProcessRecord for pid 27339
11-04 17:28:42.939 959 959 E /system/bin/tombstoned: Tombstone written to: /data/tombstones//tombstone_00
11-04 17:28:49.694 27344 27344 E QCALOG : [mlid] Failed to load libminksocket
在终端里用 `free -m`(或 `free -g`,分别以MB和GB为单位)查询手机当前的内存情况。
chiron:/data/local/tmp/bin $ free -m
total used free shared buffers
Mem: 5724 5023 700 15 236
-/+ buffers/cache: 4786 937
Swap: 2559 44 2515
chiron:/data/local/tmp/bin $ free -g
total used free shared buffers
Mem: 5 5 0 0 0
-/+ buffers/cache: 4 0
Swap: 2 0 2
根据内存查看命令free解读:
- 系统层面分析,`total = used + free`;
- 程序层面分析,`程序预留的内存 = buffers + cached`,其中:
  - shared:多个进程共享的内存总和,当前废弃不用;
  - buffers:缓冲区(buffer)占用的内存数;
  - cached:缓存(cache)占用的内存数;
举个小例子
Linux 系统会把物理内存占用一部分,即used的大小,在used中会分一部分,供所有程序实际使用。
我买个5亩地,我圈起来拿出1亩地盖房子,即系统实际使用的- buffers/cache
。在房子外的周边,系统预留出一部分当菜园,杂物室,buffers和cached就是这些除房子之外的菜园所占用地了。从外界看来,我占用的就是5亩地。但实际用到的才1亩地的住房+配房的面积。
所以得出来:
`我实际占用的地 = 我盖主房的地方(- buffers/cache)= 院子里的地(used)- 菜园占地(buffers + cached)`;
`buffers/cached` 可以分为两部分:`+ buffers/cached` 和 `- buffers/cached`。`总的物理内存 = |+ buffers/cached| + |- buffers/cached|`;
- `- buffers/cached`:程序角度上看已经使用的内存数,这才是程序实实在在用掉的内存数。
- `+ buffers/cached`:程序角度上看未使用、可用的内存数。
Segmentation Fault 时的矩阵、向量规模(每个float占32bit,每个字节是8bit,即4字节):
(16000*29000+29000+16000)*4 = 1,856,180,000B = 1.856GB
(8000+30000+30000*8000)*4 = 960,152,000B ≈ 0.96GB
from deep-learning-operator.
Related Issues (3)
- neon gemv_trans_mx1性能 HOT 1
- ncnn benchmark HOT 1
Recommend Projects
-
React
A declarative, efficient, and flexible JavaScript library for building user interfaces.
-
Vue.js
🖖 Vue.js is a progressive, incrementally-adoptable JavaScript framework for building UI on the web.
-
Typescript
TypeScript is a superset of JavaScript that compiles to clean JavaScript output.
-
TensorFlow
An Open Source Machine Learning Framework for Everyone
-
Django
The Web framework for perfectionists with deadlines.
-
Laravel
A PHP framework for web artisans
-
D3
Bring data to life with SVG, Canvas and HTML. 📊📈🎉
-
Recommend Topics
-
javascript
JavaScript (JS) is a lightweight interpreted programming language with first-class functions.
-
web
Some thing interesting about web. New door for the world.
-
server
A server is a program made to process requests and deliver data to clients.
-
Machine learning
Machine learning is a way of modeling and interpreting data that allows a piece of software to respond intelligently.
-
Visualization
Some thing interesting about visualization, use data art
-
Game
Some thing interesting about game, make everyone happy.
Recommend Org
-
Facebook
We are working to build community through open source technology. NB: members must have two-factor auth.
-
Microsoft
Open source projects and samples from Microsoft.
-
Google
Google ❤️ Open Source for everyone.
-
Alibaba
Alibaba Open Source for everyone
-
D3
Data-Driven Documents codes.
-
Tencent
China tencent open source team.
from deep-learning-operator.