
Commit 63e36a3
Support fp32 bp
1 parent: 00fece7

File tree: 62 files changed, +5682 −6 lines


TinyEngine/include/nnfunctions.h

+1,037
Large diffs are not rendered by default.

TinyEngine/include/nnfunctions_fp.h

+997
Large diffs are not rendered by default.

TinyEngine/include/tinyengine_function.h

+9-6
@@ -1,23 +1,27 @@
 /* ----------------------------------------------------------------------
- * Project: TinyEngine
+ * Project: Tiny Training Engine, MCUNetV3
  * Title: tinyengine_function.h
  *
  * Reference papers:
  *  - MCUNet: Tiny Deep Learning on IoT Device, NeurIPS 2020
  *  - MCUNetV2: Memory-Efficient Patch-based Inference for Tiny Deep Learning, NeurIPS 2021
- *  - MCUNetV3: On-Device Training Under 256KB Memory, arXiv:2206.15472
+ *  - MCUNetV3: On-Device Training Under 256KB Memory, NeurIPS 2022
  * Contact authors:
- *  - Wei-Ming Chen, [email protected]
  *  - Wei-Chen Wang, [email protected]
+ *  - Wei-Ming Chen, [email protected]
  *  - Ji Lin, [email protected]
  *  - Ligeng Zhu, [email protected]
  *  - Song Han, [email protected]
+ *  - Chuang Gan, [email protected]
  *
  * Target ISA: ARMv7E-M
  * -------------------------------------------------------------------- */
 
 #include <stdint.h>
 #include <stdbool.h>
+#include <stdlib.h>
+#include <math.h>
+
 typedef int8_t q7_t;
 typedef uint8_t q8_t;
 typedef int16_t q15_t;
@@ -146,9 +150,6 @@ tinyengine_status add_fpreq_bitmask(int size, const int8_t* input1_data, const f
 		const int8_t* input2_data, const float input2_scale, const float input2_zero, const float output_scale,
 		const float zero_y, int8_t* output_data, int8_t* output_mask);
 
-tinyengine_status where_int8(const bool* inMask, const uint16_t size, signed char* input1_data,
-		const char* input2_data, char* output_data);
-
 tinyengine_status convolve_1x1_s8_fpreq_mask_partialCH(const q7_t *input,
 		const uint16_t input_x, const uint16_t input_y, const uint16_t input_ch,
 		const q7_t *kernel_sram, const q7_t *kernel_flash, const uint16_t first_k_channel, const int32_t *bias, const float *scales,
@@ -157,5 +158,7 @@ tinyengine_status convolve_1x1_s8_fpreq_mask_partialCH(const q7_t *input,
 		q7_t *output, q7_t *mask, const uint16_t output_x, const uint16_t output_y,
 		const uint16_t output_ch, q15_t *runtime_buf);
 
+
 #include "genInclude.h"
 #include "fp_requantize_op.h"
+//#include "int8_bp_op.h"

TinyEngine/include/tinyengine_function_fp.h

+243
Large diffs are not rendered by default.
TinyEngine/src/kernels/fp_backward_op/add_fp.c

+32

@@ -0,0 +1,32 @@
+/* ----------------------------------------------------------------------
+ * Project: Tiny Training Engine, MCUNetV3
+ * Title: add_fp.c
+ *
+ * Reference papers:
+ *  - MCUNet: Tiny Deep Learning on IoT Device, NeurIPS 2020
+ *  - MCUNetV2: Memory-Efficient Patch-based Inference for Tiny Deep Learning, NeurIPS 2021
+ *  - MCUNetV3: On-Device Training Under 256KB Memory, NeurIPS 2022
+ * Contact authors:
+ *  - Wei-Chen Wang, [email protected]
+ *  - Wei-Ming Chen, [email protected]
+ *  - Ji Lin, [email protected]
+ *  - Ligeng Zhu, [email protected]
+ *  - Song Han, [email protected]
+ *  - Chuang Gan, [email protected]
+ *
+ * Target ISA: ARMv7E-M
+ * -------------------------------------------------------------------- */
+
+#include "tinyengine_function_fp.h"
+
+tinyengine_status_fp add_fp(const uint16_t size, const float* input1_data,
+		const float* input2_data, float* output_data) {
+	int i;
+
+	for (i = 0; i < size; ++i) {
+		output_data[i] = input1_data[i] + input2_data[i];
+	}
+
+	/* Return to application */
+	return STATE_SUCCESS_fp;
+}
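For reference, a minimal host-side harness for these elementwise fp32 kernels might look like the sketch below. The `main` wrapper and buffer contents are illustrative, not part of the commit; it only assumes that `tinyengine_function_fp.h` declares `add_fp` and `STATE_SUCCESS_fp`, as shown in the diff above.

#include <stdio.h>
#include "tinyengine_function_fp.h"

int main(void) {
	const float a[4] = {1.0f, 2.0f, 3.0f, 4.0f};
	const float b[4] = {0.5f, 0.5f, 0.5f, 0.5f};
	float out[4];

	/* add_fp computes out[i] = a[i] + b[i] over the whole buffer. */
	if (add_fp(4, a, b, out) == STATE_SUCCESS_fp) {
		for (int i = 0; i < 4; ++i)
			printf("%.1f\n", out[i]); /* expected: 1.5 2.5 3.5 4.5 */
	}
	return 0;
}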
TinyEngine/src/kernels/fp_backward_op/div_fp.c

+32

@@ -0,0 +1,32 @@
+/* ----------------------------------------------------------------------
+ * Project: Tiny Training Engine, MCUNetV3
+ * Title: div_fp.c
+ *
+ * Reference papers:
+ *  - MCUNet: Tiny Deep Learning on IoT Device, NeurIPS 2020
+ *  - MCUNetV2: Memory-Efficient Patch-based Inference for Tiny Deep Learning, NeurIPS 2021
+ *  - MCUNetV3: On-Device Training Under 256KB Memory, NeurIPS 2022
+ * Contact authors:
+ *  - Wei-Chen Wang, [email protected]
+ *  - Wei-Ming Chen, [email protected]
+ *  - Ji Lin, [email protected]
+ *  - Ligeng Zhu, [email protected]
+ *  - Song Han, [email protected]
+ *  - Chuang Gan, [email protected]
+ *
+ * Target ISA: ARMv7E-M
+ * -------------------------------------------------------------------- */
+
+#include "tinyengine_function_fp.h"
+
+tinyengine_status_fp div_fp(const uint16_t size, const float* input1_data,
+		const float* input2_data, float* output_data) {
+	int i;
+
+	/* Elementwise division; the caller must ensure input2_data is nonzero. */
+	for (i = 0; i < size; ++i) {
+		output_data[i] = input1_data[i] / input2_data[i];
+	}
+
+	/* Return to application */
+	return STATE_SUCCESS_fp;
+}

TinyEngine/src/kernels/fp_backward_op/group_conv_fp_kernel4_stride1_pad0.c

+276
Large diffs are not rendered by default.

TinyEngine/src/kernels/fp_backward_op/group_conv_fp_kernel8_stride1_pad0.c

+276
Large diffs are not rendered by default.
TinyEngine/src/kernels/fp_backward_op/group_pointwise_conv_fp.c

+138

@@ -0,0 +1,138 @@
+/* ----------------------------------------------------------------------
+ * Project: Tiny Training Engine, MCUNetV3
+ * Title: group_pointwise_conv_fp.c
+ *
+ * Reference papers:
+ *  - MCUNet: Tiny Deep Learning on IoT Device, NeurIPS 2020
+ *  - MCUNetV2: Memory-Efficient Patch-based Inference for Tiny Deep Learning, NeurIPS 2021
+ *  - MCUNetV3: On-Device Training Under 256KB Memory, NeurIPS 2022
+ * Contact authors:
+ *  - Wei-Chen Wang, [email protected]
+ *  - Wei-Ming Chen, [email protected]
+ *  - Ji Lin, [email protected]
+ *  - Ligeng Zhu, [email protected]
+ *  - Song Han, [email protected]
+ *  - Chuang Gan, [email protected]
+ *
+ * Target ISA: ARMv7E-M
+ * -------------------------------------------------------------------- */
+
+#include "tinyengine_function_fp.h"
+#include "tinyengine_function.h"
+#include "nnfunctions_fp.h"
+#define DIM_KER_X (1U)
+#define DIM_KER_Y (1U)
+
+tinyengine_status_fp group_pointwise_conv_fp_in1x1_out1x1_1row10col_uniweight_int8input_inplace(const int8_t* input_data,
+		const uint16_t input_height, const uint16_t input_width, const uint16_t input_depth,
+		const float* filter_data, const float* bias_data,
+		int8_t* output_weight_data, const uint16_t output_height, const uint16_t output_width, const uint16_t output_depth,
+		const float output_activation_min, const float output_activation_max,
+		float* im2col_data, const uint16_t batches, const uint16_t groups,
+		const float* scales, const float learning_rate) {
+	(void) input_height;
+	(void) input_width;
+
+	int group;
+	int output_depth_per_group = output_depth / groups;
+
+	for (group = 0; group < groups; group++) {
+		int i_ch_out;
+
+		for (i_ch_out = 0; i_ch_out < output_depth_per_group; i_ch_out += 10) {
+			/* Point to the beginning of the im2col buffer where the input is available as a rearranged column */
+			const float input_0 = (float)input_data[group];
+			const float filter[10] = {filter_data[i_ch_out], filter_data[i_ch_out + 1], filter_data[i_ch_out + 2], filter_data[i_ch_out + 3], filter_data[i_ch_out + 4],
+					filter_data[i_ch_out + 5], filter_data[i_ch_out + 6], filter_data[i_ch_out + 7], filter_data[i_ch_out + 8], filter_data[i_ch_out + 9]};
+
+			uint16_t col_count_div10 = (output_depth_per_group * DIM_KER_X * DIM_KER_Y) / 10;
+
+			while (col_count_div10--) {
+				// Assume bias_data as NULL
+				float sum[10] = {};
+
+				sum[0] += input_0 * filter[0];
+				sum[1] += input_0 * filter[1];
+				sum[2] += input_0 * filter[2];
+				sum[3] += input_0 * filter[3];
+				sum[4] += input_0 * filter[4];
+				sum[5] += input_0 * filter[5];
+				sum[6] += input_0 * filter[6];
+				sum[7] += input_0 * filter[7];
+				sum[8] += input_0 * filter[8];
+				sum[9] += input_0 * filter[9];
+
+				output_weight_data[i_ch_out * groups + group] -= TN_MIN(TN_MAX(sum[0], output_activation_min), output_activation_max) * scales[i_ch_out] * learning_rate;
+				output_weight_data[(i_ch_out + 1) * groups + group] -= TN_MIN(TN_MAX(sum[1], output_activation_min), output_activation_max) * scales[i_ch_out + 1] * learning_rate;
+				output_weight_data[(i_ch_out + 2) * groups + group] -= TN_MIN(TN_MAX(sum[2], output_activation_min), output_activation_max) * scales[i_ch_out + 2] * learning_rate;
+				output_weight_data[(i_ch_out + 3) * groups + group] -= TN_MIN(TN_MAX(sum[3], output_activation_min), output_activation_max) * scales[i_ch_out + 3] * learning_rate;
+				output_weight_data[(i_ch_out + 4) * groups + group] -= TN_MIN(TN_MAX(sum[4], output_activation_min), output_activation_max) * scales[i_ch_out + 4] * learning_rate;
+				output_weight_data[(i_ch_out + 5) * groups + group] -= TN_MIN(TN_MAX(sum[5], output_activation_min), output_activation_max) * scales[i_ch_out + 5] * learning_rate;
+				output_weight_data[(i_ch_out + 6) * groups + group] -= TN_MIN(TN_MAX(sum[6], output_activation_min), output_activation_max) * scales[i_ch_out + 6] * learning_rate;
+				output_weight_data[(i_ch_out + 7) * groups + group] -= TN_MIN(TN_MAX(sum[7], output_activation_min), output_activation_max) * scales[i_ch_out + 7] * learning_rate;
+				output_weight_data[(i_ch_out + 8) * groups + group] -= TN_MIN(TN_MAX(sum[8], output_activation_min), output_activation_max) * scales[i_ch_out + 8] * learning_rate;
+				output_weight_data[(i_ch_out + 9) * groups + group] -= TN_MIN(TN_MAX(sum[9], output_activation_min), output_activation_max) * scales[i_ch_out + 9] * learning_rate;
+			}
+		}
+	}
+
+	/* Return to application */
+	return STATE_SUCCESS_fp;
+}
+
+tinyengine_status_fp group_pointwise_conv_fp_in1x1_out1x1_1row10col_uniweight_inplace(const float* input_data,
+		const uint16_t input_height, const uint16_t input_width, const uint16_t input_depth,
+		const float* filter_data, const float* bias_data,
+		int8_t* output_weight_data, const uint16_t output_height, const uint16_t output_width, const uint16_t output_depth,
+		const float output_activation_min, const float output_activation_max,
+		float* im2col_data, const uint16_t batches, const uint16_t groups,
+		const float* scales, const float learning_rate) {
+	(void) input_height;
+	(void) input_width;
+
+	int group;
+	int output_depth_per_group = output_depth / groups;
+
+	for (group = 0; group < groups; group++) {
+		int i_ch_out;
+
+		for (i_ch_out = 0; i_ch_out < output_depth_per_group; i_ch_out += 10) {
+			/* Point to the beginning of the im2col buffer where the input is available as a rearranged column */
+			const float input_0 = input_data[group];
+			const float filter[10] = {filter_data[i_ch_out], filter_data[i_ch_out + 1], filter_data[i_ch_out + 2], filter_data[i_ch_out + 3], filter_data[i_ch_out + 4],
+					filter_data[i_ch_out + 5], filter_data[i_ch_out + 6], filter_data[i_ch_out + 7], filter_data[i_ch_out + 8], filter_data[i_ch_out + 9]};
+
+			uint16_t col_count_div10 = (output_depth_per_group * DIM_KER_X * DIM_KER_Y) / 10;
+
+			while (col_count_div10--) {
+				// Assume bias_data as NULL
+				float sum[10] = {};
+
+				sum[0] += input_0 * filter[0];
+				sum[1] += input_0 * filter[1];
+				sum[2] += input_0 * filter[2];
+				sum[3] += input_0 * filter[3];
+				sum[4] += input_0 * filter[4];
+				sum[5] += input_0 * filter[5];
+				sum[6] += input_0 * filter[6];
+				sum[7] += input_0 * filter[7];
+				sum[8] += input_0 * filter[8];
+				sum[9] += input_0 * filter[9];
+
+				output_weight_data[i_ch_out * groups + group] -= TN_MIN(TN_MAX(sum[0], output_activation_min), output_activation_max) * scales[i_ch_out] * learning_rate;
+				output_weight_data[(i_ch_out + 1) * groups + group] -= TN_MIN(TN_MAX(sum[1], output_activation_min), output_activation_max) * scales[i_ch_out + 1] * learning_rate;
+				output_weight_data[(i_ch_out + 2) * groups + group] -= TN_MIN(TN_MAX(sum[2], output_activation_min), output_activation_max) * scales[i_ch_out + 2] * learning_rate;
+				output_weight_data[(i_ch_out + 3) * groups + group] -= TN_MIN(TN_MAX(sum[3], output_activation_min), output_activation_max) * scales[i_ch_out + 3] * learning_rate;
+				output_weight_data[(i_ch_out + 4) * groups + group] -= TN_MIN(TN_MAX(sum[4], output_activation_min), output_activation_max) * scales[i_ch_out + 4] * learning_rate;
+				output_weight_data[(i_ch_out + 5) * groups + group] -= TN_MIN(TN_MAX(sum[5], output_activation_min), output_activation_max) * scales[i_ch_out + 5] * learning_rate;
+				output_weight_data[(i_ch_out + 6) * groups + group] -= TN_MIN(TN_MAX(sum[6], output_activation_min), output_activation_max) * scales[i_ch_out + 6] * learning_rate;
+				output_weight_data[(i_ch_out + 7) * groups + group] -= TN_MIN(TN_MAX(sum[7], output_activation_min), output_activation_max) * scales[i_ch_out + 7] * learning_rate;
+				output_weight_data[(i_ch_out + 8) * groups + group] -= TN_MIN(TN_MAX(sum[8], output_activation_min), output_activation_max) * scales[i_ch_out + 8] * learning_rate;
+				output_weight_data[(i_ch_out + 9) * groups + group] -= TN_MIN(TN_MAX(sum[9], output_activation_min), output_activation_max) * scales[i_ch_out + 9] * learning_rate;
+			}
+		}
+	}
+
+	/* Return to application */
+	return STATE_SUCCESS_fp;
+}
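In effect, each of these uniweight kernels fuses the gradient term of a grouped 1x1 convolution with an in-place SGD step on the int8 weights. A minimal scalar sketch of the per-weight update follows; the helper name is illustrative and not part of the commit, and it assumes (as the kernels do) that bias_data is NULL:

#include <stdint.h>

/* Scalar form of the fused update the kernels above perform:
 *   w -= learning_rate * scale * clamp(input * filter, act_min, act_max)
 * TN_MIN/TN_MAX in the kernel implement the clamp. */
static inline int8_t sgd_step_int8(int8_t w, float input, float filter,
		float scale, float learning_rate,
		float act_min, float act_max) {
	float g = input * filter;                       /* 1x1-conv gradient term */
	if (g < act_min) g = act_min;                   /* clamp low */
	if (g > act_max) g = act_max;                   /* clamp high */
	return (int8_t)(w - g * scale * learning_rate); /* in-place descent step */
}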
TinyEngine/src/kernels/fp_backward_op/less_fp.c

+32

@@ -0,0 +1,32 @@
+/* ----------------------------------------------------------------------
+ * Project: Tiny Training Engine, MCUNetV3
+ * Title: less_fp.c
+ *
+ * Reference papers:
+ *  - MCUNet: Tiny Deep Learning on IoT Device, NeurIPS 2020
+ *  - MCUNetV2: Memory-Efficient Patch-based Inference for Tiny Deep Learning, NeurIPS 2021
+ *  - MCUNetV3: On-Device Training Under 256KB Memory, NeurIPS 2022
+ * Contact authors:
+ *  - Wei-Chen Wang, [email protected]
+ *  - Wei-Ming Chen, [email protected]
+ *  - Ji Lin, [email protected]
+ *  - Ligeng Zhu, [email protected]
+ *  - Song Han, [email protected]
+ *  - Chuang Gan, [email protected]
+ *
+ * Target ISA: ARMv7E-M
+ * -------------------------------------------------------------------- */
+
+#include "tinyengine_function_fp.h"
+
+tinyengine_status_fp less(const uint16_t size, const float* input1_data,
+		const float* input2_data, bool* output_data) {
+	int i;
+
+	/* Elementwise comparison mask: true where input1 < input2. */
+	for (i = 0; i < size; ++i) {
+		output_data[i] = input1_data[i] < input2_data[i];
+	}
+
+	/* Return to application */
+	return STATE_SUCCESS_fp;
+}
TinyEngine/src/kernels/fp_backward_op/log_softmax_fp.c

+34

@@ -0,0 +1,34 @@
+/* ----------------------------------------------------------------------
+ * Name: log_softmax_fp.c
+ * Project: TinyEngine, MCUNetV3
+ * Contact author: Wei-Chen Wang, [email protected]
+ * -------------------------------------------------------------------- */
+
+#include <float.h>
+#include "tinyengine_function_fp.h"
+#include "tinyengine_function.h"
+
+tinyengine_status_fp LogSoftmax(const float* input_data, const uint16_t input_height, const uint16_t input_width, const uint16_t input_depth,
+		float* output_data, const uint16_t output_height, const uint16_t output_width, const uint16_t output_depth) {
+	const int outer_size = input_height * input_width;
+	const int depth = TN_MIN(input_depth, output_depth);
+
+	for (int i = 0; i < outer_size; ++i) {
+		/* Start from -FLT_MAX, not FLT_MIN: FLT_MIN is the smallest positive
+		 * float, which would break the running max for all-negative rows. */
+		float max = -FLT_MAX;
+		for (int c = 0; c < depth; ++c) {
+			max = TN_MAX(max, input_data[i * depth + c]);
+		}
+
+		float sum = 0.f;
+		for (int c = 0; c < depth; ++c) {
+			sum += exp(input_data[i * depth + c] - max);
+		}
+
+		const float log_sum = log(sum);
+		for (int c = 0; c < depth; ++c) {
+			output_data[i * depth + c] = input_data[i * depth + c] - max - log_sum;
+		}
+	}
+
+	/* Return to application */
+	return STATE_SUCCESS_fp;
+}
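The kernel uses the standard max-shift so that exp never overflows. With m = max_c x_c, the identity it computes per spatial position is:

log_softmax(x_c) = x_c - m - log(sum_c' exp(x_c' - m))

which equals x_c - log(sum_c' exp(x_c')) algebraically, since the m terms cancel, but keeps every exponent nonpositive.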
TinyEngine/src/kernels/fp_backward_op/mul_fp.c

+32

@@ -0,0 +1,32 @@
+/* ----------------------------------------------------------------------
+ * Project: Tiny Training Engine, MCUNetV3
+ * Title: mul_fp.c
+ *
+ * Reference papers:
+ *  - MCUNet: Tiny Deep Learning on IoT Device, NeurIPS 2020
+ *  - MCUNetV2: Memory-Efficient Patch-based Inference for Tiny Deep Learning, NeurIPS 2021
+ *  - MCUNetV3: On-Device Training Under 256KB Memory, NeurIPS 2022
+ * Contact authors:
+ *  - Wei-Chen Wang, [email protected]
+ *  - Wei-Ming Chen, [email protected]
+ *  - Ji Lin, [email protected]
+ *  - Ligeng Zhu, [email protected]
+ *  - Song Han, [email protected]
+ *  - Chuang Gan, [email protected]
+ *
+ * Target ISA: ARMv7E-M
+ * -------------------------------------------------------------------- */
+
+#include "tinyengine_function_fp.h"
+
+tinyengine_status_fp mul(const uint16_t size, const float* input1_data,
+		const float* input2_data, float* output_data) {
+	int i;
+
+	for (i = 0; i < size; ++i) {
+		output_data[i] = input1_data[i] * input2_data[i];
+	}
+
+	/* Return to application */
+	return STATE_SUCCESS_fp;
+}
TinyEngine/src/kernels/fp_backward_op/negative_fp.c

+31

@@ -0,0 +1,31 @@
+/* ----------------------------------------------------------------------
+ * Project: Tiny Training Engine, MCUNetV3
+ * Title: negative_fp.c
+ *
+ * Reference papers:
+ *  - MCUNet: Tiny Deep Learning on IoT Device, NeurIPS 2020
+ *  - MCUNetV2: Memory-Efficient Patch-based Inference for Tiny Deep Learning, NeurIPS 2021
+ *  - MCUNetV3: On-Device Training Under 256KB Memory, NeurIPS 2022
+ * Contact authors:
+ *  - Wei-Chen Wang, [email protected]
+ *  - Wei-Ming Chen, [email protected]
+ *  - Ji Lin, [email protected]
+ *  - Ligeng Zhu, [email protected]
+ *  - Song Han, [email protected]
+ *  - Chuang Gan, [email protected]
+ *
+ * Target ISA: ARMv7E-M
+ * -------------------------------------------------------------------- */
+
+#include "tinyengine_function_fp.h"
+
+tinyengine_status_fp negative(const uint16_t size, const float* input1_data, bool* output_data) {
+	int i;
+
+	/* Emit a sign mask: true where the input is negative. */
+	for (i = 0; i < size; ++i) {
+		output_data[i] = input1_data[i] < 0;
+	}
+
+	/* Return to application */
+	return STATE_SUCCESS_fp;
+}
