
Commit 9034766

AndreySorokin7 authored
Adding 4D input logic, fixing FCLayer for this, adding a new softmax for a large number of images, parallelizing ConvLayer (#170)
Co-authored-by: AndreySorokin7 <andrey_sorokin_nn@mail.ru>
1 parent 37ba43d commit 9034766

File tree

8 files changed, +297 -130 lines changed

README.md

Lines changed: 6 additions & 5 deletions
````diff
@@ -23,15 +23,16 @@ To build and run this project locally on Windows, follow these steps:
 mkdir build
 cd build
 cmake .. -DCMAKE_BUILD_TYPE=Release
-If you want to build in a debug, change the release to debug
 ```
-*Note: Make sure you have CMake installed to build the project.*
-4. **Build the project:**
+If you want to build in a Debug, change the Release to Debug
+
+*Note: Make sure you have CMake installed to build the project.*
+
+5. **Build the project:**
 Next, to build the project, we will need to enter the command
 ```bash
 cmake --build . --config Release
 ```
-5. **Run the project**
+6. **Run the project**
 After building the project, you can find the executable file in the following path from the *build* folder
 ```bash
 cd app\Release
@@ -77,7 +78,7 @@ To build and run this project locally on Windows, follow these steps:
 ```bash
 cmake --build build --config Release
 ```
-If you want to build in a debug, change the release to debug
+If you want to build in a Debug, change the Release to Debug
 6. **Run the project**
 After building the project, you can find the executable file in the following path from the *build* folder
 ```bash
````

app/Graph/acc_check_mnist.cpp

Lines changed: 35 additions & 23 deletions
```diff
@@ -11,6 +11,17 @@ int main() {
   std::vector<size_t> counts = {979, 1134, 1031, 1009, 981,
                                 891, 957, 1027, 973, 1008};
   int stat = 0;
+  size_t sum = std::accumulate(counts.begin(), counts.end(), size_t{0});
+  int count_pic = static_cast<int>(sum) + 10;
+  std::vector<float> res(count_pic * 28 * 28);
+  Tensor input;
+  Shape sh1({1, 5, 5, 3});
+  std::vector<float> vec;
+  vec.reserve(75);
+  for (int i = 0; i < 75; ++i) {
+    vec.push_back(3);
+  }
+  Tensor output = make_tensor(vec, sh1);
 
   for (size_t name = 0; name < 10; name++) {
     for (size_t ind = 0; ind < counts[name] + 1; ind++) {
@@ -19,7 +30,6 @@ int main() {
           << ".png";
       std::string png = oss.str();
       std::string image_path = MNIST_PATH + png;
-      std::cout << image_path << std::endl;
 
       cv::Mat image = cv::imread(image_path);
       if (image.empty()) {
@@ -28,37 +38,39 @@ int main() {
       cv::cvtColor(image, image, cv::COLOR_BGR2GRAY);
       std::vector<cv::Mat> channels;
       cv::split(image, channels);
-      int count_pic = 1;
-      std::vector<float> res(count_pic * 28 * 28);
       for (int i = 0; i < 28; ++i) {
         for (int j = 0; j < 28; ++j) {
-          res[i * 28 + j] = channels[0].at<uchar>(j, i);
+          size_t a = ind;
+          for (size_t n = 0; n < name; n++) a += counts[n] + 1;
+          res[(a) * 28 * 28 + i * 28 + j] = channels[0].at<uchar>(j, i);
         }
       }
-      Shape sh({static_cast<size_t>(count_pic), 1, 28, 28});
-      Tensor t = make_tensor<float>(res, sh);
-      Tensor input = t;
-      Shape sh1({1, 5, 5, 3});
-      std::vector<float> vec;
-      vec.reserve(75);
-      for (int i = 0; i < 75; ++i) {
-        vec.push_back(3);
-      }
-      Tensor output = make_tensor(vec, sh1);
-      build_graph(input, output, false);
-      std::vector<float> tmp_output = softmax<float>(*output.as<float>());
-      for (size_t i = 0; i < tmp_output.size(); i++) {
-        if (tmp_output[i] >= 1e-6) {
-          if (i == name) stat++;
-        }
+    }
+  }
+  Shape sh({static_cast<size_t>(count_pic), 1, 28, 28});
+  Tensor t = make_tensor<float>(res, sh);
+  input = t;
+  build_graph(input, output, false);
+  std::vector<std::vector<float>> tmp_output =
+      softmax<float>(*output.as<float>(), 10);
+  std::vector<size_t> indices;
+  for (const auto& row : tmp_output) {
+    for (size_t j = 0; j < row.size(); ++j) {
+      if (row[j] >= 1e-6) {
+        indices.push_back(j);
+        break;
       }
     }
   }
-
-  size_t sum = std::accumulate(counts.begin(), counts.end(), size_t{0});
+  for (size_t name = 0; name < 10; name++) {
+    for (size_t ind = 0; ind < counts[name] + 1; ind++) {
+      size_t a = ind;
+      for (size_t n = 0; n < name; n++) a += counts[n] + 1;
+      if (name == indices[a]) stat++;
+    }
+  }
   double percentage =
       (static_cast<double>(stat) / static_cast<double>(sum + 10)) * 100;
   std::cout << "Stat: " << std::fixed << std::setprecision(2) << percentage
             << "%" << std::endl;
-  std::cout << percentage << std::endl;
 }
```
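The accuracy check now pushes the whole MNIST test set through the graph in one batch, so the old single-vector `softmax<float>(...)` call is replaced by a two-argument call that yields one probability row per image. A minimal sketch of what such a row-wise softmax could look like, assuming the second argument is the number of classes per row (the helper name `softmax_rows` is hypothetical and not the project's actual implementation):

```cpp
#include <algorithm>
#include <cmath>
#include <vector>

// Hypothetical row-wise softmax: split a flat vector into rows of `classes`
// values and normalize each row independently, subtracting the row maximum
// for numerical stability. Sketch only; the commit's softmax<T>(vec, n) may differ.
template <typename T>
std::vector<std::vector<T>> softmax_rows(const std::vector<T>& flat,
                                         size_t classes) {
  std::vector<std::vector<T>> result;
  for (size_t start = 0; start + classes <= flat.size(); start += classes) {
    T max_val = *std::max_element(flat.begin() + start,
                                  flat.begin() + start + classes);
    std::vector<T> row(classes);
    T sum = 0;
    for (size_t j = 0; j < classes; ++j) {
      row[j] = std::exp(flat[start + j] - max_val);
      sum += row[j];
    }
    for (T& v : row) v /= sum;  // each row now sums to 1
    result.push_back(std::move(row));
  }
  return result;
}
```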

include/layers/ConvLayer.hpp

Lines changed: 119 additions & 53 deletions
```diff
@@ -1,6 +1,7 @@
 #pragma once
 #include <cmath>
 #include <stdexcept>
+#include <thread>
 #include <vector>
 
 #include "layers/Layer.hpp"
@@ -143,36 +144,58 @@ void Conv4D(const Tensor& input, const Tensor& kernel_, const Tensor& bias_,
       std::vector<std::vector<std::vector<ValueType>>>(
           in_height, std::vector<std::vector<ValueType>>(
                          in_width, std::vector<ValueType>(in_channels, 1))));
-  for (size_t n = 0; n < batch_size; n++) {
-    for (size_t c = 0; c < in_channels; c++) {
-      for (size_t h = 0; h < in_height; h++) {
-        for (size_t w = 0; w < in_width; w++) {
-          input_tensor[n][h][w][c] = input.get<ValueType>({n, c, h, w});
+
+  auto init_input = [&](size_t start_b, size_t end_b) {
+    for (size_t n = start_b; n < end_b; n++) {
+      for (size_t c = 0; c < in_channels; c++) {
+        for (size_t h = 0; h < in_height; h++) {
+          for (size_t w = 0; w < in_width; w++) {
+            input_tensor[n][h][w][c] = input.get<ValueType>({n, c, h, w});
+          }
         }
       }
     }
-  }
-  // adapt input
+  };
 
   std::vector<std::vector<std::vector<std::vector<ValueType>>>> kernel(
       kernel_height,
       std::vector<std::vector<std::vector<ValueType>>>(
           kernel_width, std::vector<std::vector<ValueType>>(
                             kernel_in_channels,
                             std::vector<ValueType>(kernel_out_channels, 1))));
-  for (size_t h = 0; h < kernel_height; h++) {
-    for (size_t w = 0; w < kernel_width; w++) {
-      for (size_t n = 0; n < kernel_in_channels; n++) {
-        for (size_t c = 0; c < kernel_out_channels; c++) {
-          kernel[h][w][n][c] = kernel_.get<ValueType>({h, w, n, c});
+
+  auto init_kernel = [&](size_t start_h, size_t end_h) {
+    for (size_t h = start_h; h < end_h; h++) {
+      for (size_t w = 0; w < kernel_width; w++) {
+        for (size_t n = 0; n < kernel_in_channels; n++) {
+          for (size_t c = 0; c < kernel_out_channels; c++) {
+            kernel[h][w][n][c] = kernel_.get<ValueType>({h, w, n, c});
+          }
        }
      }
    }
+  };
+
+  unsigned num_threads = std::thread::hardware_concurrency();
+  std::vector<std::thread> threads;
+  size_t chunk_size = batch_size / num_threads;
+
+  for (unsigned i = 0; i < num_threads; ++i) {
+    size_t start = i * chunk_size;
+    size_t end = (i == num_threads - 1) ? batch_size : start + chunk_size;
+    threads.emplace_back(init_input, start, end);
   }
-  // adapt kernel
+  for (auto& t : threads) t.join();
+  threads.clear();
 
-  // pads_ = (kernel_height * dilations_ + 1 - dilations_) / 2;
-  // ???
+  chunk_size = kernel_height / num_threads;
+  for (unsigned i = 0; i < num_threads; ++i) {
+    size_t start = i * chunk_size;
+    size_t end = (i == num_threads - 1) ? kernel_height : start + chunk_size;
+    threads.emplace_back(init_kernel, start, end);
+  }
+  for (auto& t : threads) t.join();
+  threads.clear();
 
   std::vector<std::vector<std::vector<std::vector<ValueType>>>> padded_input =
       input_tensor;
@@ -185,19 +208,28 @@ void Conv4D(const Tensor& input, const Tensor& kernel_, const Tensor& bias_,
             in_width + 2 * pads_,
             std::vector<ValueType>(in_channels, 0))));
 
-    for (size_t b = 0; b < batch_size; ++b) {
-      for (size_t h = 0; h < in_height; ++h) {
-        for (size_t w = 0; w < in_width; ++w) {
-          for (size_t c = 0; c < in_channels; ++c) {
-            padded_input[b][h + pads_][w + pads_][c] = input_tensor[b][h][w][c];
+    auto pad_input = [&](size_t start_b, size_t end_b) {
+      for (size_t b = start_b; b < end_b; ++b) {
+        for (size_t h = 0; h < in_height; ++h) {
+          for (size_t w = 0; w < in_width; ++w) {
+            for (size_t c = 0; c < in_channels; ++c) {
+              padded_input[b][h + pads_][w + pads_][c] =
+                  input_tensor[b][h][w][c];
+            }
           }
         }
       }
+    };
+
+    chunk_size = batch_size / num_threads;
+    for (unsigned i = 0; i < num_threads; ++i) {
+      size_t start = i * chunk_size;
+      size_t end = (i == num_threads - 1) ? batch_size : start + chunk_size;
+      threads.emplace_back(pad_input, start, end);
     }
+    for (auto& t : threads) t.join();
+    threads.clear();
   }
-  // | | | | |
-  // | data |
-  // | | | | |
 
   std::vector<std::vector<std::vector<std::vector<ValueType>>>> dil_kernel =
       kernel;
@@ -210,16 +242,28 @@ void Conv4D(const Tensor& input, const Tensor& kernel_, const Tensor& bias_,
                 kernel_in_channels,
                 std::vector<ValueType>(kernel_out_channels, 0))));
 
-    for (size_t b = 0; b < kernel_out_channels; ++b) {
-      for (size_t h = 0; h < kernel_height; ++h) {
-        for (size_t w = 0; w < kernel_width; ++w) {
-          for (size_t c = 0; c < kernel_in_channels; ++c) {
-            dil_kernel[h * dilations_][w * dilations_][c][b] =
-                kernel[h][w][c][b];
+    auto dilate_kernel = [&](size_t start_b, size_t end_b) {
+      for (size_t b = start_b; b < end_b; ++b) {
+        for (size_t h = 0; h < kernel_height; ++h) {
+          for (size_t w = 0; w < kernel_width; ++w) {
+            for (size_t c = 0; c < kernel_in_channels; ++c) {
+              dil_kernel[h * dilations_][w * dilations_][c][b] =
+                  kernel[h][w][c][b];
+            }
           }
         }
       }
+    };
+
+    chunk_size = kernel_out_channels / num_threads;
+    for (unsigned i = 0; i < num_threads; ++i) {
+      size_t start = i * chunk_size;
+      size_t end =
+          (i == num_threads - 1) ? kernel_out_channels : start + chunk_size;
+      threads.emplace_back(dilate_kernel, start, end);
    }
+    for (auto& t : threads) t.join();
+    threads.clear();
   }
 
   size_t crat = 0;
@@ -231,7 +275,6 @@ void Conv4D(const Tensor& input, const Tensor& kernel_, const Tensor& bias_,
       crat;
 
   crat = 0;
-
   if ((in_width + 2 * pads_ - dilations_ * (kernel_width - 1)) % stride_ != 0)
     crat = 1;
 
@@ -244,44 +287,67 @@ void Conv4D(const Tensor& input, const Tensor& kernel_, const Tensor& bias_,
           std::vector<std::vector<ValueType>>(
               out_height, std::vector<ValueType>(out_width, 0))));
 
-  for (size_t b = 0; b < batch_size; ++b) {
-    for (size_t c = 0; c < kernel_out_channels; ++c) {
-      for (size_t i = 0; i < out_height; i += stride_) {
-        for (size_t j = 0; j < out_width; j += stride_) {
-          ValueType value = 0;
-          for (size_t ic = 0; ic < in_channels; ++ic) {
-            for (size_t h = 0; h < kernel_height * dilations_ + 1 - dilations_;
-                 ++h) {
-              for (size_t w = 0; w < kernel_width * dilations_ + 1 - dilations_;
-                   ++w) {
-                value +=
-                    padded_input[b][i + h][j + w][ic] * dil_kernel[h][w][ic][c];
+  auto compute_conv = [&](size_t start_b, size_t end_b) {
+    for (size_t b = start_b; b < end_b; ++b) {
+      for (size_t c = 0; c < kernel_out_channels; ++c) {
+        for (size_t i = 0; i < out_height; i += stride_) {
+          for (size_t j = 0; j < out_width; j += stride_) {
+            ValueType value = 0;
+            for (size_t ic = 0; ic < in_channels; ++ic) {
+              for (size_t h = 0;
+                   h < kernel_height * dilations_ + 1 - dilations_; ++h) {
+                for (size_t w = 0;
+                     w < kernel_width * dilations_ + 1 - dilations_; ++w) {
+                  value += padded_input[b][i + h][j + w][ic] *
+                           dil_kernel[h][w][ic][c];
+                }
               }
             }
-          }
-          if (!bias_.empty()) {
-            output_tensor[b][c][i][j] = value + (*bias_.as<ValueType>())[c];
-          } else {
-            output_tensor[b][c][i][j] = value;
+            if (!bias_.empty()) {
+              output_tensor[b][c][i][j] = value + (*bias_.as<ValueType>())[c];
+            } else {
+              output_tensor[b][c][i][j] = value;
+            }
           }
         }
       }
     }
+  };
+
+  chunk_size = batch_size / num_threads;
+  for (unsigned i = 0; i < num_threads; ++i) {
+    size_t start = i * chunk_size;
+    size_t end = (i == num_threads - 1) ? batch_size : start + chunk_size;
+    threads.emplace_back(compute_conv, start, end);
   }
+  for (auto& t : threads) t.join();
+  threads.clear();
 
   Shape sh({batch_size, kernel_out_channels, out_height, out_width});
   std::vector<ValueType> one_d_vector(batch_size * out_height * out_width *
                                       kernel_out_channels);
-  size_t index_1d = 0;
-  for (size_t i = 0; i < batch_size; ++i) {
-    for (size_t l = 0; l < kernel_out_channels; ++l) {
-      for (size_t j = 0; j < out_height; ++j) {
-        for (size_t k = 0; k < out_width; ++k) {
-          one_d_vector[index_1d++] = output_tensor[i][l][j][k];
+
+  auto flatten_output = [&](size_t start_b, size_t end_b) {
+    size_t index_1d = start_b * kernel_out_channels * out_height * out_width;
+    for (size_t i = start_b; i < end_b; ++i) {
+      for (size_t l = 0; l < kernel_out_channels; ++l) {
+        for (size_t j = 0; j < out_height; ++j) {
+          for (size_t k = 0; k < out_width; ++k) {
+            one_d_vector[index_1d++] = output_tensor[i][l][j][k];
+          }
        }
      }
    }
+  };
+
+  chunk_size = batch_size / num_threads;
+  for (unsigned i = 0; i < num_threads; ++i) {
+    size_t start = i * chunk_size;
+    size_t end = (i == num_threads - 1) ? batch_size : start + chunk_size;
    threads.emplace_back(flatten_output, start, end);
   }
+  for (auto& t : threads) t.join();
+
   output = make_tensor<ValueType>(one_d_vector, sh);
 }
```

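Every stage of `Conv4D` (input re-layout, kernel re-layout, padding, kernel dilation, the convolution itself, and flattening the result) now follows the same pattern: wrap the loop nest in a lambda over a `[start, end)` range, split the range across `std::thread::hardware_concurrency()` threads with the last thread taking the remainder, and join before the next stage. A sketch of that pattern as a standalone helper (the helper, its name, and the guard against a zero or oversized thread count are mine, not part of the commit):

```cpp
#include <algorithm>
#include <functional>
#include <thread>
#include <vector>

// Hypothetical helper illustrating the chunked spawn/join pattern used in
// Conv4D: divide [0, n) into roughly equal ranges, one per thread, and give
// the last thread whatever remains after integer division.
inline void parallel_for_chunks(size_t n,
                                const std::function<void(size_t, size_t)>& body) {
  size_t num_threads = std::max(1u, std::thread::hardware_concurrency());
  num_threads = std::min(num_threads, std::max<size_t>(n, 1));  // no empty chunks
  size_t chunk_size = n / num_threads;
  std::vector<std::thread> threads;
  for (size_t i = 0; i < num_threads; ++i) {
    size_t start = i * chunk_size;
    size_t end = (i == num_threads - 1) ? n : start + chunk_size;
    threads.emplace_back(body, start, end);
  }
  for (auto& t : threads) t.join();  // every stage joins before the next begins
}
```

With a helper like this, each repeated spawn/join block in the diff would reduce to a single call such as `parallel_for_chunks(batch_size, init_input);`.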