
Commit 5867129

Esteban Padilla Cerdio authored and facebook-github-bot committed
Add metric for 3D texture max concurrent cache read (#4421)
Summary:
Pull Request resolved: #4421

This diff introduces a metric that measures the maximum number of concurrent cache line accesses for each dimension of a 3D texture. The experiment lets each thread access a different texel on the texture and slowly increases the number of threads until the cache line can no longer serve all simultaneous accesses. By detecting the resulting jump in latency, we can determine the maximum size that can be accessed concurrently along each dimension.

NOTE: ArchProbe uses this information to [obtain a supposed cache line size for textures](https://fburl.com/98xiou3g). However, it is unclear why they define the cache line size as the ratio of the larger concurrency value to the smaller one, multiplied by the texel size. It is also unclear how to extend their calculation to three dimensions.

TODO: Understand the relationship between concurrency and cache line size, and modify this metric to output the cache line size.

For a Samsung S22, the latency graph looks like this: {F1780375117}

Reviewed By: copyrightly

Differential Revision: D60246121

fbshipit-source-id: c2bac010077bf14e95f70bb6038acbb47a534dde
1 parent 298b625 commit 5867129
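
To make the experiment described in the summary concrete, here is a minimal, self-contained sketch of the ramp-and-detect loop: latencies are collected while the thread count grows, and the ramp stops at the first clear jump. Both the latency model (`measure_latency_us`) and the jump rule are invented for illustration; the actual harness uses `benchmark_on_gpu` and `DtJumpFinder`, shown in the `app.cpp` diff below.

```cpp
#include <cstdint>
#include <iostream>
#include <vector>

// Hypothetical latency model: pretend the texture cache can serve 16
// concurrent texel accesses along the probed dimension; one more access
// forces a second cache line and latency steps up.
double measure_latency_us(uint32_t nthread) {
  return nthread <= 16 ? 100.0 : 220.0;
}

int main() {
  const double COMPENSATE = 0.1; // same knob as config.json
  const double THRESHOLD = 3.0;  // same knob as config.json
  const uint32_t TEXEL_SIZE = 4 * sizeof(float); // RGBA float texel = 16 B
  const uint32_t MAX_NTHREAD = 256;

  std::vector<double> history;
  for (uint32_t nthread = 1; nthread <= MAX_NTHREAD; ++nthread) {
    const double t = measure_latency_us(nthread);
    if (!history.empty()) {
      double mean = 0.0;
      for (double s : history) {
        mean += s;
      }
      mean /= history.size();
      // Simplified jump rule (stand-in for DtJumpFinder): stop at the first
      // sample that exceeds the running mean by more than
      // COMPENSATE * THRESHOLD (30% with the values above).
      if ((t - mean) / mean > COMPENSATE * THRESHOLD) {
        const uint32_t max_concurrency = nthread - 1;
        std::cout << "max concurrent texel accesses: " << max_concurrency
                  << " (" << max_concurrency * TEXEL_SIZE << " B)" << std::endl;
        return 0;
      }
    }
    history.push_back(t);
  }
  std::cout << "no latency jump observed up to " << MAX_NTHREAD << " threads"
            << std::endl;
  return 0;
}
```

With this toy model, the loop reports a maximum concurrency of 16 texels (256 B) along the probed dimension, which is the kind of per-dimension value the real metric prints.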

File tree

4 files changed: +155 -1 lines changed


backends/vulkan/tools/gpuinfo/config.json

Lines changed: 5 additions & 0 deletions
```diff
@@ -39,5 +39,10 @@
     "nflush": 4,
     "nunroll": 16,
     "niter": 10
+  },
+  "tex_cacheline_concurr": {
+    "enabled": true,
+    "threshold": 3,
+    "compensate": 0.1
   }
 }
```
Lines changed: 39 additions & 0 deletions
```diff
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#version 450 core
+
+#define PRECISION ${PRECISION}
+#define VEC4_T ${texel_type(DTYPE)}
+
+layout(std430) buffer;
+
+${layout_declare_sampler(0, "r", "in_tex", DTYPE)}
+${layout_declare_buffer(1, "w", "out_buf", DTYPE, "PRECISION", False)}
+
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+layout(constant_id = 3) const int niter = 1;
+
+void main() {
+  vec4 sum = vec4(0);
+  int i = 0;
+  for (; i < niter; ++i){
+    $if DIM == 0:
+      sum += texelFetch(in_tex, ivec3(gl_GlobalInvocationID[0], 0, 0), 0);
+    $elif DIM == 1:
+      sum += texelFetch(in_tex, ivec3(0, gl_GlobalInvocationID[0], 0), 0);
+    $elif DIM == 2:
+      sum += texelFetch(in_tex, ivec3(0, 0, gl_GlobalInvocationID[0]), 0);
+  }
+
+  // This is to ensure no compiler optimizations occur
+  vec4 zero = vec4(i>>31);
+
+  out_buf[0] = sum + zero;
+}
```
Lines changed: 14 additions & 0 deletions
```diff
@@ -0,0 +1,14 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+tex_cacheline_concurr:
+  parameter_names_with_default_values:
+    DTYPE: float
+  generate_variant_forall:
+    DIM:
+      - RANGE: [0, 2]
+  shader_variants:
+    - NAME: tex_cacheline_concurr
```
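
For orientation, the `RANGE: [0, 2]` entry above makes the codegen emit one shader variant per texture dimension, and the host addresses them by name suffix. The sketch below only illustrates that mapping, using the naming convention visible in the `app.cpp` diff further down.

```cpp
#include <iostream>
#include <string>

int main() {
  // DIM = 0, 1, 2 yields one shader variant per texture dimension; app.cpp
  // selects a variant by appending the dimension index to the base name.
  for (int dim = 0; dim < 3; ++dim) {
    const std::string shader_name =
        "tex_cacheline_concurr_" + std::to_string(dim);
    std::cout << "dim " << dim << " -> " << shader_name << std::endl;
  }
  return 0;
}
```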

backends/vulkan/tools/gpuinfo/src/app.cpp

Lines changed: 97 additions & 1 deletion
```diff
@@ -291,12 +291,107 @@ class App {
     if (stride >= MAX_STRIDE) {
       std::cout << "Unable to conclude a top level buffer cacheline size."
                 << std::endl;
-      cacheline_size = MAX_STRIDE;
+      cacheline_size = MAX_STRIDE * sizeof(float);
     }
 
     std::cout << "BufTopLevelCachelineSize," << cacheline_size << std::endl;
   }
 
+  // Textures are drastically different from buffers in terms of data layout.
+  // While buffers are a contiguous range of memory, textures are opaque
+  // objects defined by the vendor, and it is possible that nearby points of
+  // data are not neighboring in memory. Likewise, data points are accessed in
+  // multi-dimensional patches instead of simple lines. This makes the stride
+  // method for figuring out the cache line size not applicable. To work
+  // around this, this experiment runs an increasing number of threads
+  // accessing different data points in the texture and measures latency. If
+  // the cache line is big enough to contain all requested data for that
+  // number of threads, latency will be low. When there are more threads, and
+  // hence more data, than a single cache line can handle, a second line must
+  // be fetched, increasing latency in a measurable way.
+  void tex_cacheline_concurr() {
+    if (!_enabled("tex_cacheline_concurr")) {
+      std::cout << "Skipped Texture Cacheline Optimal Concurrency" << std::endl;
+      return;
+    }
+
+    const uint32_t TEXEL_WIDTH = 4;
+    const uint32_t TEXEL_SIZE = sizeof(float) * TEXEL_WIDTH;
+
+    const double COMPENSATE =
+        _get_config("tex_cacheline_concurr", "compensate");
+    const double THRESHOLD = _get_config("tex_cacheline_concurr", "threshold");
+
+    for (int dim = 0; dim < 3; ++dim) {
+      std::cout << std::endl;
+      std::cout << "------ Texture Cacheline Optimal Concurrency (dim = " << dim
+                << ") ------" << std::endl;
+
+      uint32_t NITER;
+
+      const uint32_t IMG_OTHER_EDGE = dim == 0 ? max_tex_width_
+          : dim == 1                           ? max_tex_height_
+                                               : max_tex_depth_;
+
+      const uint32_t MAX_NTHREAD = std::min(nthread_logic_, IMG_OTHER_EDGE);
+
+      auto bench = [&](uint32_t nthread) {
+        std::vector<int64_t> sizes_whd = {
+            max_tex_width_, max_tex_height_, max_tex_depth_};
+
+        auto sizes_nchw = _whd_to_nchw(sizes_whd);
+
+        vTensor in_tensor =
+            api::vTensor(api::context(), sizes_nchw, vkapi::kFloat);
+
+        StorageBuffer out_buf(context(), vkapi::kFloat, TEXEL_WIDTH);
+
+        vkapi::PipelineBarrier pipeline_barrier{};
+
+        auto shader_name = "tex_cacheline_concurr_" + std::to_string(dim);
+
+        auto time = benchmark_on_gpu(shader_name, 100, [&]() {
+          context()->submit_compute_job(
+              VK_KERNEL_FROM_STR(shader_name),
+              pipeline_barrier,
+              {nthread, 1, 1},
+              {nthread, 1, 1},
+              {SV(NITER)},
+              VK_NULL_HANDLE,
+              0,
+              in_tensor.image(),
+              out_buf.buffer());
+        });
+        return time;
+      };
+
+      ensure_min_niter(1000, NITER, [&]() { return bench(1); });
+
+      DtJumpFinder<5> dj(COMPENSATE, THRESHOLD);
+      uint32_t nthread = 1;
+      for (; nthread <= MAX_NTHREAD; ++nthread) {
+        double time = bench(nthread);
+        std::cout << "Testing nthread=\t" << nthread << "\t, time=\t" << time
+                  << std::endl;
+
+        if (dj.push(time)) {
+          auto max_concurrency = nthread - 1;
+          std::cout << "TextureCachelineConcurrencyDim" << dim << " (B),"
+                    << max_concurrency * TEXEL_SIZE << std::endl;
+          break;
+        }
+      }
+      if (nthread >= MAX_NTHREAD) {
+        std::cout
+            << "Unable to conclude an optimal texture cacheline concurrency for dim "
+            << dim << std::endl;
+      }
+    }
+
+    // TODO: Use concurrency information to obtain the cache line size for
+    // textures as done in https://fburl.com/98xiou3g
+  }
+
  private:
   void _bandwidth(std::string memtype, uint32_t range) {
     auto memtype_lower = memtype;
@@ -689,6 +784,7 @@ int main(int argc, const char** argv) {
   app.shared_mem_bandwidth();
   app.warp_size();
   app.tex_bandwidth();
+  app.tex_cacheline_concurr();
 
   return 0;
 }
```
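
As a reading aid for the TODO at the end of the new function: the ArchProbe heuristic mentioned in the summary divides the larger per-dimension concurrency by the smaller one and multiplies by the texel size. The sketch below encodes that reading with made-up concurrency values; whether this interpretation, or its extension to three dimensions, is correct is exactly the open question, so treat it as a hypothesis rather than an established formula.

```cpp
#include <algorithm>
#include <cstdint>
#include <iostream>

int main() {
  const uint32_t TEXEL_SIZE = 4 * sizeof(float); // RGBA float texel = 16 B

  // Placeholder per-dimension results, in texels, of the kind that
  // tex_cacheline_concurr() reports (its output in bytes, divided by
  // TEXEL_SIZE).
  const uint32_t concurrency_x = 32;
  const uint32_t concurrency_y = 16;

  // ArchProbe-style reading: cache line size ~= (larger / smaller) * texel size.
  const uint32_t larger = std::max(concurrency_x, concurrency_y);
  const uint32_t smaller = std::min(concurrency_x, concurrency_y);
  const uint32_t cacheline_size_guess = (larger / smaller) * TEXEL_SIZE;

  std::cout << "estimated texture cache line size: " << cacheline_size_guess
            << " B" << std::endl; // (32 / 16) * 16 B = 32 B here
  return 0;
}
```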
