#pragma once
#include <cmath>
#include <stdexcept>
+ #include <thread>
#include <vector>

#include "layers/Layer.hpp"
@@ -143,36 +144,58 @@ void Conv4D(const Tensor& input, const Tensor& kernel_, const Tensor& bias_,
          std::vector<std::vector<std::vector<ValueType>>>(
              in_height, std::vector<std::vector<ValueType>>(
                             in_width, std::vector<ValueType>(in_channels, 1))));
-  for (size_t n = 0; n < batch_size; n++) {
-    for (size_t c = 0; c < in_channels; c++) {
-      for (size_t h = 0; h < in_height; h++) {
-        for (size_t w = 0; w < in_width; w++) {
-          input_tensor[n][h][w][c] = input.get<ValueType>({n, c, h, w});
+
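+  // Copy the NCHW input into the nested-vector NHWC buffer. The lambda takes a
+  // [start_b, end_b) batch range so the copy can be split across threads below.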
+  auto init_input = [&](size_t start_b, size_t end_b) {
+    for (size_t n = start_b; n < end_b; n++) {
+      for (size_t c = 0; c < in_channels; c++) {
+        for (size_t h = 0; h < in_height; h++) {
+          for (size_t w = 0; w < in_width; w++) {
+            input_tensor[n][h][w][c] = input.get<ValueType>({n, c, h, w});
+          }
        }
      }
    }
-  }
-  // adapt input
+  };

  std::vector<std::vector<std::vector<std::vector<ValueType>>>> kernel(
      kernel_height,
      std::vector<std::vector<std::vector<ValueType>>>(
          kernel_width, std::vector<std::vector<ValueType>>(
                            kernel_in_channels,
                            std::vector<ValueType>(kernel_out_channels, 1))));
-  for (size_t h = 0; h < kernel_height; h++) {
-    for (size_t w = 0; w < kernel_width; w++) {
-      for (size_t n = 0; n < kernel_in_channels; n++) {
-        for (size_t c = 0; c < kernel_out_channels; c++) {
-          kernel[h][w][n][c] = kernel_.get<ValueType>({h, w, n, c});
+
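+  // Copy the kernel tensor into a nested-vector [h][w][in_c][out_c] buffer,
+  // parallelised over the kernel height dimension.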
+  auto init_kernel = [&](size_t start_h, size_t end_h) {
+    for (size_t h = start_h; h < end_h; h++) {
+      for (size_t w = 0; w < kernel_width; w++) {
+        for (size_t n = 0; n < kernel_in_channels; n++) {
+          for (size_t c = 0; c < kernel_out_channels; c++) {
+            kernel[h][w][n][c] = kernel_.get<ValueType>({h, w, n, c});
+          }
        }
      }
    }
+  };
+
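+  // Partition each loop across the available hardware threads: every thread
+  // gets a contiguous chunk, and the last thread also takes the remainder.
+  // This assumes hardware_concurrency() reports at least one thread.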
+  unsigned num_threads = std::thread::hardware_concurrency();
+  std::vector<std::thread> threads;
+  size_t chunk_size = batch_size / num_threads;
+
+  for (unsigned i = 0; i < num_threads; ++i) {
+    size_t start = i * chunk_size;
+    size_t end = (i == num_threads - 1) ? batch_size : start + chunk_size;
+    threads.emplace_back(init_input, start, end);
  }
-  // adapt kernel
+  for (auto& t : threads) t.join();
+  threads.clear();

-  // pads_ = (kernel_height * dilations_ + 1 - dilations_) / 2;
-  // ???
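+  // Re-use the same threads vector to fill the repacked kernel, this time
+  // chunked over kernel rows.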
+  chunk_size = kernel_height / num_threads;
+  for (unsigned i = 0; i < num_threads; ++i) {
+    size_t start = i * chunk_size;
+    size_t end = (i == num_threads - 1) ? kernel_height : start + chunk_size;
+    threads.emplace_back(init_kernel, start, end);
+  }
+  for (auto& t : threads) t.join();
+  threads.clear();

  std::vector<std::vector<std::vector<std::vector<ValueType>>>> padded_input =
      input_tensor;
@@ -185,19 +208,28 @@ void Conv4D(const Tensor& input, const Tensor& kernel_, const Tensor& bias_,
              in_width + 2 * pads_,
              std::vector<ValueType>(in_channels, 0))));

-    for (size_t b = 0; b < batch_size; ++b) {
-      for (size_t h = 0; h < in_height; ++h) {
-        for (size_t w = 0; w < in_width; ++w) {
-          for (size_t c = 0; c < in_channels; ++c) {
-            padded_input[b][h + pads_][w + pads_][c] = input_tensor[b][h][w][c];
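+    // Copy the input into the zero-padded buffer, again sliced by batch index
+    // so each thread writes a disjoint set of images.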
+    auto pad_input = [&](size_t start_b, size_t end_b) {
+      for (size_t b = start_b; b < end_b; ++b) {
+        for (size_t h = 0; h < in_height; ++h) {
+          for (size_t w = 0; w < in_width; ++w) {
+            for (size_t c = 0; c < in_channels; ++c) {
+              padded_input[b][h + pads_][w + pads_][c] =
+                  input_tensor[b][h][w][c];
+            }
          }
        }
      }
+    };
+
+    chunk_size = batch_size / num_threads;
+    for (unsigned i = 0; i < num_threads; ++i) {
+      size_t start = i * chunk_size;
+      size_t end = (i == num_threads - 1) ? batch_size : start + chunk_size;
+      threads.emplace_back(pad_input, start, end);
    }
+    for (auto& t : threads) t.join();
+    threads.clear();
  }
-  // | | | | |
-  // | data |
-  // | | | | |

  std::vector<std::vector<std::vector<std::vector<ValueType>>>> dil_kernel =
      kernel;
@@ -210,16 +242,28 @@ void Conv4D(const Tensor& input, const Tensor& kernel_, const Tensor& bias_,
              kernel_in_channels,
              std::vector<ValueType>(kernel_out_channels, 0))));

-    for (size_t b = 0; b < kernel_out_channels; ++b) {
-      for (size_t h = 0; h < kernel_height; ++h) {
-        for (size_t w = 0; w < kernel_width; ++w) {
-          for (size_t c = 0; c < kernel_in_channels; ++c) {
-            dil_kernel[h * dilations_][w * dilations_][c][b] =
-                kernel[h][w][c][b];
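+    // Scatter the kernel weights into the dilated kernel, one chunk of output
+    // channels per thread.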
+    auto dilate_kernel = [&](size_t start_b, size_t end_b) {
+      for (size_t b = start_b; b < end_b; ++b) {
+        for (size_t h = 0; h < kernel_height; ++h) {
+          for (size_t w = 0; w < kernel_width; ++w) {
+            for (size_t c = 0; c < kernel_in_channels; ++c) {
+              dil_kernel[h * dilations_][w * dilations_][c][b] =
+                  kernel[h][w][c][b];
+            }
          }
        }
      }
+    };
+
+    chunk_size = kernel_out_channels / num_threads;
+    for (unsigned i = 0; i < num_threads; ++i) {
+      size_t start = i * chunk_size;
+      size_t end =
+          (i == num_threads - 1) ? kernel_out_channels : start + chunk_size;
+      threads.emplace_back(dilate_kernel, start, end);
    }
+    for (auto& t : threads) t.join();
+    threads.clear();
  }

  size_t crat = 0;
@@ -231,7 +275,6 @@ void Conv4D(const Tensor& input, const Tensor& kernel_, const Tensor& bias_,
         crat;

  crat = 0;
-
  if ((in_width + 2 * pads_ - dilations_ * (kernel_width - 1)) % stride_ != 0)
    crat = 1;
@@ -244,44 +287,67 @@ void Conv4D(const Tensor& input, const Tensor& kernel_, const Tensor& bias_,
          std::vector<std::vector<ValueType>>(
              out_height, std::vector<ValueType>(out_width, 0))));

-  for (size_t b = 0; b < batch_size; ++b) {
-    for (size_t c = 0; c < kernel_out_channels; ++c) {
-      for (size_t i = 0; i < out_height; i += stride_) {
-        for (size_t j = 0; j < out_width; j += stride_) {
-          ValueType value = 0;
-          for (size_t ic = 0; ic < in_channels; ++ic) {
-            for (size_t h = 0; h < kernel_height * dilations_ + 1 - dilations_;
-                 ++h) {
-              for (size_t w = 0; w < kernel_width * dilations_ + 1 - dilations_;
-                   ++w) {
-                value +=
-                    padded_input[b][i + h][j + w][ic] * dil_kernel[h][w][ic][c];
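+  // The convolution itself: each thread handles a batch slice and accumulates
+  // the padded input against the dilated kernel for every output position.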
+  auto compute_conv = [&](size_t start_b, size_t end_b) {
+    for (size_t b = start_b; b < end_b; ++b) {
+      for (size_t c = 0; c < kernel_out_channels; ++c) {
+        for (size_t i = 0; i < out_height; i += stride_) {
+          for (size_t j = 0; j < out_width; j += stride_) {
+            ValueType value = 0;
+            for (size_t ic = 0; ic < in_channels; ++ic) {
+              for (size_t h = 0;
+                   h < kernel_height * dilations_ + 1 - dilations_; ++h) {
+                for (size_t w = 0;
+                     w < kernel_width * dilations_ + 1 - dilations_; ++w) {
+                  value += padded_input[b][i + h][j + w][ic] *
+                           dil_kernel[h][w][ic][c];
+                }
              }
            }
-          }
-          if (!bias_.empty()) {
-            output_tensor[b][c][i][j] = value + (*bias_.as<ValueType>())[c];
-          } else {
-            output_tensor[b][c][i][j] = value;
+            if (!bias_.empty()) {
+              output_tensor[b][c][i][j] = value + (*bias_.as<ValueType>())[c];
+            } else {
+              output_tensor[b][c][i][j] = value;
+            }
          }
        }
      }
    }
+  };
+
+  chunk_size = batch_size / num_threads;
+  for (unsigned i = 0; i < num_threads; ++i) {
+    size_t start = i * chunk_size;
+    size_t end = (i == num_threads - 1) ? batch_size : start + chunk_size;
+    threads.emplace_back(compute_conv, start, end);
  }
+  for (auto& t : threads) t.join();
+  threads.clear();

  Shape sh({batch_size, kernel_out_channels, out_height, out_width});
  std::vector<ValueType> one_d_vector(batch_size * out_height * out_width *
                                      kernel_out_channels);
-  size_t index_1d = 0;
-  for (size_t i = 0; i < batch_size; ++i) {
-    for (size_t l = 0; l < kernel_out_channels; ++l) {
-      for (size_t j = 0; j < out_height; ++j) {
-        for (size_t k = 0; k < out_width; ++k) {
-          one_d_vector[index_1d++] = output_tensor[i][l][j][k];
+
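+  // Flatten the NCHW output into a contiguous buffer; each thread starts at a
+  // precomputed offset so the writes of different threads do not overlap.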
+  auto flatten_output = [&](size_t start_b, size_t end_b) {
+    size_t index_1d = start_b * kernel_out_channels * out_height * out_width;
+    for (size_t i = start_b; i < end_b; ++i) {
+      for (size_t l = 0; l < kernel_out_channels; ++l) {
+        for (size_t j = 0; j < out_height; ++j) {
+          for (size_t k = 0; k < out_width; ++k) {
+            one_d_vector[index_1d++] = output_tensor[i][l][j][k];
+          }
        }
      }
    }
+  };
+
+  chunk_size = batch_size / num_threads;
+  for (unsigned i = 0; i < num_threads; ++i) {
+    size_t start = i * chunk_size;
+    size_t end = (i == num_threads - 1) ? batch_size : start + chunk_size;
+    threads.emplace_back(flatten_output, start, end);
  }
+  for (auto& t : threads) t.join();
+
  output = make_tensor<ValueType>(one_d_vector, sh);
}