2525namespace HPCombi {
2626static_assert (std::is_trivial<BMat16>(), " BMat16 is not a trivial class!" );
2727
28- static constexpr xpu16 line{0x800 , 0x901 , 0xa02 , 0xb03 , 0xc04 , 0xd05 , 0xe06 , 0xf07 , 0x800 , 0x901 , 0xa02 , 0xb03 , 0xc04 , 0xd05 , 0xe06 , 0xf07 };
29- static constexpr xpu16 block{0x200 , 0x604 , 0xa08 , 0xe0c , 0x301 , 0x705 , 0xb09 , 0xf0d , 0x200 , 0x604 , 0xa08 , 0xe0c , 0x301 , 0x705 , 0xb09 , 0xf0d };
28+ static constexpr xpu16 line{0x800 , 0x901 , 0xa02 , 0xb03 , 0xc04 , 0xd05 ,
29+ 0xe06 , 0xf07 , 0x800 , 0x901 , 0xa02 , 0xb03 ,
30+ 0xc04 , 0xd05 , 0xe06 , 0xf07 };
31+ static constexpr xpu16 block{0x200 , 0x604 , 0xa08 , 0xe0c , 0x301 , 0x705 ,
32+ 0xb09 , 0xf0d , 0x200 , 0x604 , 0xa08 , 0xe0c ,
33+ 0x301 , 0x705 , 0xb09 , 0xf0d };
3034
3135inline xpu64 to_line (xpu64 vect) {
3236 return simde_mm256_shuffle_epi8 (vect, line);
@@ -36,7 +40,8 @@ inline xpu64 to_block(xpu64 vect) {
3640 return simde_mm256_shuffle_epi8 (vect, block);
3741}
3842
39- inline BMat16::BMat16 (uint64_t n0, uint64_t n1, uint64_t n2, uint64_t n3) noexcept {
43+ inline BMat16::BMat16 (uint64_t n0, uint64_t n1, uint64_t n2,
44+ uint64_t n3) noexcept {
4045 xpu64 tmp{n0, n1, n2, n3};
4146 _data = to_line (tmp);
4247}
@@ -47,27 +52,25 @@ inline BMat16::BMat16(std::vector<std::vector<bool>> const &mat) noexcept {
4752 std::array<uint64_t , 4 > tmp = {0 , 0 , 0 , 0 };
4853 for (int i = mat.size () - 1 ; i >= 0 ; --i) {
4954 HPCOMBI_ASSERT (mat.size () == mat[i].size ());
50- tmp[i/ 4 ] <<= 16 - mat.size ();
55+ tmp[i / 4 ] <<= 16 - mat.size ();
5156 for (int j = mat[i].size () - 1 ; j >= 0 ; --j) {
52- tmp[i/ 4 ] = (tmp[i/ 4 ] << 1 ) | mat[i][j];
57+ tmp[i / 4 ] = (tmp[i / 4 ] << 1 ) | mat[i][j];
5358 }
5459 }
5560 _data = xpu64{tmp[0 ], tmp[1 ], tmp[2 ], tmp[3 ]};
5661}
5762
5863inline bool BMat16::operator ()(size_t i, size_t j) const noexcept {
59- return (_data[i/ 4 ] >> (16 * (i% 4 ) + j)) & 1 ;
64+ return (_data[i / 4 ] >> (16 * (i % 4 ) + j)) & 1 ;
6065}
6166
6267inline void BMat16::set (size_t i, size_t j, bool val) noexcept {
6368 HPCOMBI_ASSERT (i < 16 );
6469 HPCOMBI_ASSERT (j < 16 );
6570 uint64_t a = 1 ;
66- a <<= 16 * (i%4 ) + j;
67- xpu64 mask{(i/4 == 0 )*a,
68- (i/4 == 1 )*a,
69- (i/4 == 2 )*a,
70- (i/4 == 3 )*a};
71+ a <<= 16 * (i % 4 ) + j;
72+ xpu64 mask{(i / 4 == 0 ) * a, (i / 4 == 1 ) * a, (i / 4 == 2 ) * a,
73+ (i / 4 == 3 ) * a};
7174 _data ^= (-val ^ _data) & mask;
7275}
7376
@@ -77,28 +80,36 @@ inline bool BMat16::operator==(BMat16 const &that) const noexcept {
7780}
7881
7982inline bool BMat16::operator <(BMat16 const &that) const noexcept {
80- return _data[0 ] < that._data [0 ] ||
81- (_data[0 ] == that._data [0 ] && (_data[1 ] < that._data [1 ] ||
82- (_data[1 ] == that._data [1 ] && (_data[2 ] < that._data [2 ] ||
83- (_data[2 ] == that._data [2 ] && (_data[3 ] < that._data [3 ]))))));
83+ return _data[0 ] < that._data [0 ] ||
84+ (_data[0 ] == that._data [0 ] &&
85+ (_data[1 ] < that._data [1 ] ||
86+ (_data[1 ] == that._data [1 ] &&
87+ (_data[2 ] < that._data [2 ] ||
88+ (_data[2 ] == that._data [2 ] && (_data[3 ] < that._data [3 ]))))));
8489}
8590
8691inline bool BMat16::operator >(BMat16 const &that) const noexcept {
87- return _data[0 ] > that._data [0 ] ||
88- (_data[0 ] == that._data [0 ] && (_data[1 ] > that._data [1 ] ||
89- (_data[1 ] == that._data [1 ] && (_data[2 ] > that._data [2 ] ||
90- (_data[2 ] == that._data [2 ] && (_data[3 ] > that._data [3 ]))))));
92+ return _data[0 ] > that._data [0 ] ||
93+ (_data[0 ] == that._data [0 ] &&
94+ (_data[1 ] > that._data [1 ] ||
95+ (_data[1 ] == that._data [1 ] &&
96+ (_data[2 ] > that._data [2 ] ||
97+ (_data[2 ] == that._data [2 ] && (_data[3 ] > that._data [3 ]))))));
9198}
9299
93100inline std::array<std::array<bool , 16 >, 16 > BMat16::to_array () const noexcept {
94101 xpu64 tmp = to_block (_data);
95102 uint64_t a = tmp[0 ], b = tmp[1 ], c = tmp[2 ], d = tmp[3 ];
96103 std::array<std::array<bool , 16 >, 16 > res;
97104 for (size_t i = 0 ; i < 64 ; ++i) {
98- res[i/8 ][i%8 ] = a & 1 ; a >>= 1 ;
99- res[i/8 ][8 + i%8 ] = b & 1 ; b >>= 1 ;
100- res[8 + i/8 ][i%8 ] = c & 1 ; c >>= 1 ;
101- res[8 + i/8 ][8 + i%8 ] = d & 1 ; d >>= 1 ;
105+ res[i / 8 ][i % 8 ] = a & 1 ;
106+ a >>= 1 ;
107+ res[i / 8 ][8 + i % 8 ] = b & 1 ;
108+ b >>= 1 ;
109+ res[8 + i / 8 ][i % 8 ] = c & 1 ;
110+ c >>= 1 ;
111+ res[8 + i / 8 ][8 + i % 8 ] = d & 1 ;
112+ d >>= 1 ;
102113 }
103114 return res;
104115}
@@ -107,10 +118,10 @@ inline BMat16 BMat16::transpose_naive() const noexcept {
107118 uint64_t a = 0 , b = 0 , c = 0 , d = 0 ;
108119 for (int i = 7 ; i >= 0 ; --i) {
109120 for (int j = 7 ; j >= 0 ; --j) {
110- a = (a << 1 ) | (*this )(j, i);
111- b = (b << 1 ) | (*this )(j+ 8 , i);
112- c = (c << 1 ) | (*this )(j, i+ 8 );
113- d = (d << 1 ) | (*this )(j+ 8 , i+ 8 );
121+ a = (a << 1 ) | (*this )(j, i);
122+ b = (b << 1 ) | (*this )(j + 8 , i);
123+ c = (c << 1 ) | (*this )(j, i + 8 );
124+ d = (d << 1 ) | (*this )(j + 8 , i + 8 );
114125 }
115126 }
116127 return BMat16 (a, b, c, d);
@@ -119,25 +130,33 @@ inline BMat16 BMat16::transpose_naive() const noexcept {
119130inline BMat16 BMat16::transpose () const noexcept {
120131 xpu64 tmp = to_block (_data);
121132 xpu64 x = simde_mm256_set_epi64x (tmp[3 ], tmp[1 ], tmp[2 ], tmp[0 ]);
122- xpu64 y = (x ^ (x >> 7 )) & (xpu64{0xAA00AA00AA00AA , 0xAA00AA00AA00AA , 0xAA00AA00AA00AA , 0xAA00AA00AA00AA });
133+ xpu64 y = (x ^ (x >> 7 )) & (xpu64{0xAA00AA00AA00AA , 0xAA00AA00AA00AA ,
134+ 0xAA00AA00AA00AA , 0xAA00AA00AA00AA });
123135 x = x ^ y ^ (y << 7 );
124- y = (x ^ (x >> 14 )) & (xpu64{0xCCCC0000CCCC , 0xCCCC0000CCCC , 0xCCCC0000CCCC , 0xCCCC0000CCCC });
136+ y = (x ^ (x >> 14 )) &
137+ (xpu64{0xCCCC0000CCCC , 0xCCCC0000CCCC , 0xCCCC0000CCCC , 0xCCCC0000CCCC });
125138 x = x ^ y ^ (y << 14 );
126- y = (x ^ (x >> 28 )) & (xpu64{0xF0F0F0F0 , 0xF0F0F0F0 , 0xF0F0F0F0 , 0xF0F0F0F0 });
139+ y = (x ^ (x >> 28 )) &
140+ (xpu64{0xF0F0F0F0 , 0xF0F0F0F0 , 0xF0F0F0F0 , 0xF0F0F0F0 });
127141 x = x ^ y ^ (y << 28 );
128142 return BMat16 (to_line (x));
129143}
130144
131- static constexpr xpu16 rot{0x302 , 0x504 , 0x706 , 0x908 , 0xb0a , 0xd0c , 0xf0e , 0x100 , 0x302 , 0x504 , 0x706 , 0x908 , 0xb0a , 0xd0c , 0xf0e , 0x100 };
145+ static constexpr xpu16 rot{0x302 , 0x504 , 0x706 , 0x908 , 0xb0a , 0xd0c ,
146+ 0xf0e , 0x100 , 0x302 , 0x504 , 0x706 , 0x908 ,
147+ 0xb0a , 0xd0c , 0xf0e , 0x100 };
132148
133149inline BMat16 BMat16::mult_transpose (BMat16 const &that) const noexcept {
134150 xpu16 x = _data;
135151 xpu16 y1 = that._data ;
136- xpu16 y2 = simde_mm256_set_epi64x (that._data [1 ], that._data [0 ], that._data [3 ], that._data [2 ]);
152+ xpu16 y2 = simde_mm256_set_epi64x (that._data [1 ], that._data [0 ],
153+ that._data [3 ], that._data [2 ]);
137154 xpu16 zero = simde_mm256_setzero_si256 ();
138155 xpu16 data = simde_mm256_setzero_si256 ();
139- xpu16 diag1{0x1 , 0x2 , 0x4 , 0x8 , 0x10 , 0x20 , 0x40 , 0x80 , 0x100 , 0x200 , 0x400 , 0x800 , 0x1000 , 0x2000 , 0x4000 , 0x8000 };
140- xpu16 diag2{0x100 , 0x200 , 0x400 , 0x800 , 0x1000 , 0x2000 , 0x4000 , 0x8000 , 0x1 , 0x2 , 0x4 , 0x8 , 0x10 , 0x20 , 0x40 , 0x80 };
156+ xpu16 diag1{0x1 , 0x2 , 0x4 , 0x8 , 0x10 , 0x20 , 0x40 , 0x80 ,
157+ 0x100 , 0x200 , 0x400 , 0x800 , 0x1000 , 0x2000 , 0x4000 , 0x8000 };
158+ xpu16 diag2{0x100 , 0x200 , 0x400 , 0x800 , 0x1000 , 0x2000 , 0x4000 , 0x8000 ,
159+ 0x1 , 0x2 , 0x4 , 0x8 , 0x10 , 0x20 , 0x40 , 0x80 };
141160 for (size_t i = 0 ; i < 8 ; ++i) {
142161 data |= ((x & y1) != zero) & diag1;
143162 data |= ((x & y2) != zero) & diag2;
@@ -151,41 +170,51 @@ inline BMat16 BMat16::mult_transpose(BMat16 const &that) const noexcept {
151170
152171inline BMat16 BMat16::mult_4bmat8 (BMat16 const &that) const noexcept {
153172 BMat16 tmp = that.transpose ();
154- xpu64 t1 = to_block (_data),
155- t2 = to_block (tmp._data );
156- BMat8 a1 (t1[0 ]), b1 (t1[1 ]), c1 (t1[2 ]), d1 (t1[3 ]),
157- a2 (t2[0 ]), b2 (t2[1 ]), c2 (t2[2 ]), d2 (t2[3 ]);
158- return BMat16 ((a1.mult_transpose (a2) | b1.mult_transpose (b2)).to_int (),
159- (a1.mult_transpose (c2) | b1.mult_transpose (d2)).to_int (),
160- (c1.mult_transpose (a2) | d1.mult_transpose (b2)).to_int (),
173+ xpu64 t1 = to_block (_data), t2 = to_block (tmp._data );
174+ BMat8 a1 (t1[0 ]), b1 (t1[1 ]), c1 (t1[2 ]), d1 (t1[3 ]), a2 (t2[0 ]), b2 (t2[1 ]),
175+ c2 (t2[2 ]), d2 (t2[3 ]);
176+ return BMat16 ((a1.mult_transpose (a2) | b1.mult_transpose (b2)).to_int (),
177+ (a1.mult_transpose (c2) | b1.mult_transpose (d2)).to_int (),
178+ (c1.mult_transpose (a2) | d1.mult_transpose (b2)).to_int (),
161179 (c1.mult_transpose (c2) | d1.mult_transpose (d2)).to_int ());
162180}
163181
164182inline BMat16 BMat16::mult_naive (BMat16 const &that) const noexcept {
165183 uint64_t a = 0 , b = 0 , c = 0 , d = 0 ;
166184 for (int i = 7 ; i >= 0 ; --i) {
167185 for (int j = 7 ; j >= 0 ; --j) {
168- a <<= 1 ; b <<= 1 ; c <<= 1 ; d <<= 1 ;
186+ a <<= 1 ;
187+ b <<= 1 ;
188+ c <<= 1 ;
189+ d <<= 1 ;
169190 for (size_t k = 0 ; k < 8 ; ++k) {
170- a |= ((*this )(i, k) & that (k, j)) | ((*this )(i, k + 8 ) & that (k + 8 , j));
171- b |= ((*this )(i, k) & that (k, j + 8 )) | ((*this )(i, k + 8 ) & that (k + 8 , j + 8 ));
172- c |= ((*this )(i + 8 , k) & that (k, j)) | ((*this )(i + 8 , k + 8 ) & that (k + 8 , j));
173- d |= ((*this )(i + 8 , k) & that (k, j + 8 )) | ((*this )(i + 8 , k + 8 ) & that (k + 8 , j + 8 ));
191+ a |= ((*this )(i, k) & that (k, j)) |
192+ ((*this )(i, k + 8 ) & that (k + 8 , j));
193+ b |= ((*this )(i, k) & that (k, j + 8 )) |
194+ ((*this )(i, k + 8 ) & that (k + 8 , j + 8 ));
195+ c |= ((*this )(i + 8 , k) & that (k, j)) |
196+ ((*this )(i + 8 , k + 8 ) & that (k + 8 , j));
197+ d |= ((*this )(i + 8 , k) & that (k, j + 8 )) |
198+ ((*this )(i + 8 , k + 8 ) & that (k + 8 , j + 8 ));
174199 }
175200 }
176201 }
177202 return BMat16 (a, b, c, d);
178203}
179204
180205inline BMat16 BMat16::mult_naive_array (BMat16 const &that) const noexcept {
181- std::array<std::array<bool , 16 >, 16 > tab1 = to_array (), tab2 = that.to_array ();
206+ std::array<std::array<bool , 16 >, 16 > tab1 = to_array (),
207+ tab2 = that.to_array ();
182208 uint64_t a = 0 , b = 0 , c = 0 , d = 0 ;
183209 for (int i = 7 ; i >= 0 ; --i) {
184210 for (int j = 7 ; j >= 0 ; --j) {
185- a <<= 1 ; b <<= 1 ; c <<= 1 ; d <<= 1 ;
211+ a <<= 1 ;
212+ b <<= 1 ;
213+ c <<= 1 ;
214+ d <<= 1 ;
186215 for (size_t k = 0 ; k < 16 ; ++k) {
187- a |= tab1[i][k] & tab2[k][j];
188- b |= tab1[i][k] & tab2[k][j + 8 ];
216+ a |= tab1[i][k] & tab2[k][j];
217+ b |= tab1[i][k] & tab2[k][j + 8 ];
189218 c |= tab1[i + 8 ][k] & tab2[k][j];
190219 d |= tab1[i + 8 ][k] & tab2[k][j + 8 ];
191220 }
@@ -194,14 +223,15 @@ inline BMat16 BMat16::mult_naive_array(BMat16 const &that) const noexcept {
194223 return BMat16 (a, b, c, d);
195224}
196225
197- inline size_t BMat16::nr_rows () const noexcept {
226+ inline size_t BMat16::nr_rows () const noexcept {
198227 size_t res = 0 ;
199228 for (size_t i = 0 ; i < 16 ; ++i)
200- if ((_data[i/ 4 ] << (16 * (i% 4 )) >> 48 ) != 0 )
229+ if ((_data[i / 4 ] << (16 * (i % 4 )) >> 48 ) != 0 )
201230 ++res;
202231 return res;
203232
    // // Vectorized version which doesn't work due to the absence of popcnt
    // // in simde
205235 // xpu16 tmp = _data, zero = simde_mm256_setzero_si256();
206236 // xpu16 x = (tmp != zero);
207237 // return simde_mm256_popcnt_epi16(x);
@@ -210,7 +240,7 @@ inline size_t BMat16::nr_rows() const noexcept{
210240inline std::vector<uint16_t > BMat16::rows () const {
211241 std::vector<uint16_t > rows;
212242 for (size_t i = 0 ; i < 16 ; ++i) {
213- uint16_t row_rev = (_data[i/ 4 ] << (16 * (3 - i% 4 )) >> 48 );
243+ uint16_t row_rev = (_data[i / 4 ] << (16 * (3 - i % 4 )) >> 48 );
214244
215245 // The row needs to be reversed
216246 uint16_t row = 0 ;
@@ -232,45 +262,80 @@ inline BMat16 BMat16::random() {
232262}
233263
234264static const constexpr std::array<xpu64, 16 > ROW_MASK16 = {
235- xpu16{0xffff , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 },
236- xpu16{0 , 0xffff , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 },
237- xpu16{0 , 0 , 0xffff , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 },
238- xpu16{0 , 0 , 0 , 0xffff , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 },
239- xpu16{0 , 0 , 0 , 0 , 0xffff , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 },
240- xpu16{0 , 0 , 0 , 0 , 0 , 0xffff , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 },
241- xpu16{0 , 0 , 0 , 0 , 0 , 0 , 0xffff , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 },
242- xpu16{0 , 0 , 0 , 0 , 0 , 0 , 0 , 0xffff , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 },
243- xpu16{0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0xffff , 0 , 0 , 0 , 0 , 0 , 0 , 0 },
244- xpu16{0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0xffff , 0 , 0 , 0 , 0 , 0 , 0 },
245- xpu16{0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0xffff , 0 , 0 , 0 , 0 , 0 },
246- xpu16{0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0xffff , 0 , 0 , 0 , 0 },
247- xpu16{0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0xffff , 0 , 0 , 0 },
248- xpu16{0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0xffff , 0 , 0 },
249- xpu16{0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0xffff , 0 },
250- xpu16{0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0xffff }
251- };
265+ static_cast <xpu64>(
266+ xpu16{0xffff , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 }),
267+ static_cast <xpu64>(
268+ xpu16{0 , 0xffff , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 }),
269+ static_cast <xpu64>(
270+ xpu16{0 , 0 , 0xffff , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 }),
271+ static_cast <xpu64>(
272+ xpu16{0 , 0 , 0 , 0xffff , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 }),
273+ static_cast <xpu64>(
274+ xpu16{0 , 0 , 0 , 0 , 0xffff , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 }),
275+ static_cast <xpu64>(
276+ xpu16{0 , 0 , 0 , 0 , 0 , 0xffff , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 }),
277+ static_cast <xpu64>(
278+ xpu16{0 , 0 , 0 , 0 , 0 , 0 , 0xffff , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 }),
279+ static_cast <xpu64>(
280+ xpu16{0 , 0 , 0 , 0 , 0 , 0 , 0 , 0xffff , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 }),
281+ static_cast <xpu64>(
282+ xpu16{0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0xffff , 0 , 0 , 0 , 0 , 0 , 0 , 0 }),
283+ static_cast <xpu64>(
284+ xpu16{0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0xffff , 0 , 0 , 0 , 0 , 0 , 0 }),
285+ static_cast <xpu64>(
286+ xpu16{0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0xffff , 0 , 0 , 0 , 0 , 0 }),
287+ static_cast <xpu64>(
288+ xpu16{0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0xffff , 0 , 0 , 0 , 0 }),
289+ static_cast <xpu64>(
290+ xpu16{0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0xffff , 0 , 0 , 0 }),
291+ static_cast <xpu64>(
292+ xpu16{0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0xffff , 0 , 0 }),
293+ static_cast <xpu64>(
294+ xpu16{0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0xffff , 0 }),
295+ static_cast <xpu64>(
296+ xpu16{0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0xffff })};
252297
253298static const constexpr std::array<xpu64, 16 > COL_MASK16 = {
254- xpu16{1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 },
255- xpu16{2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 },
256- xpu16{4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 },
257- xpu16{8 , 8 , 8 , 8 , 8 , 8 , 8 , 8 , 8 , 8 , 8 , 8 , 8 , 8 , 8 , 8 },
258- xpu16{0x10 , 0x10 , 0x10 , 0x10 , 0x10 , 0x10 , 0x10 , 0x10 , 0x10 , 0x10 , 0x10 , 0x10 , 0x10 , 0x10 , 0x10 , 0x10 },
259- xpu16{0x20 , 0x20 , 0x20 , 0x20 , 0x20 , 0x20 , 0x20 , 0x20 , 0x20 , 0x20 , 0x20 , 0x20 , 0x20 , 0x20 , 0x20 , 0x20 },
260- xpu16{0x40 , 0x40 , 0x40 , 0x40 , 0x40 , 0x40 , 0x40 , 0x40 , 0x40 , 0x40 , 0x40 , 0x40 , 0x40 , 0x40 , 0x40 , 0x40 },
261- xpu16{0x80 , 0x80 , 0x80 , 0x80 , 0x80 , 0x80 , 0x80 , 0x80 , 0x80 , 0x80 , 0x80 , 0x80 , 0x80 , 0x80 , 0x80 , 0x80 },
262- xpu16{0x100 , 0x100 , 0x100 , 0x100 , 0x100 , 0x100 , 0x100 , 0x100 , 0x100 , 0x100 , 0x100 , 0x100 , 0x100 , 0x100 , 0x100 , 0x100 },
263- xpu16{0x200 , 0x200 , 0x200 , 0x200 , 0x200 , 0x200 , 0x200 , 0x200 , 0x200 , 0x200 , 0x200 , 0x200 , 0x200 , 0x200 , 0x200 , 0x200 },
264- xpu16{0x400 , 0x400 , 0x400 , 0x400 , 0x400 , 0x400 , 0x400 , 0x400 , 0x400 , 0x400 , 0x400 , 0x400 , 0x400 , 0x400 , 0x400 , 0x400 },
265- xpu16{0x800 , 0x800 , 0x800 , 0x800 , 0x800 , 0x800 , 0x800 , 0x800 , 0x800 , 0x800 , 0x800 , 0x800 , 0x800 , 0x800 , 0x800 , 0x800 },
266- xpu16{0x1000 , 0x1000 , 0x1000 , 0x1000 , 0x1000 , 0x1000 , 0x1000 , 0x1000 , 0x1000 , 0x1000 , 0x1000 , 0x1000 , 0x1000 , 0x1000 , 0x1000 , 0x1000 },
267- xpu16{0x2000 , 0x2000 , 0x2000 , 0x2000 , 0x2000 , 0x2000 , 0x2000 , 0x2000 , 0x2000 , 0x2000 , 0x2000 , 0x2000 , 0x2000 , 0x2000 , 0x2000 , 0x2000 },
268- xpu16{0x4000 , 0x4000 , 0x4000 , 0x4000 , 0x4000 , 0x4000 , 0x4000 , 0x4000 , 0x4000 , 0x4000 , 0x4000 , 0x4000 , 0x4000 , 0x4000 , 0x4000 , 0x4000 },
269- xpu16{0x8000 , 0x8000 , 0x8000 , 0x8000 , 0x8000 , 0x8000 , 0x8000 , 0x8000 , 0x8000 , 0x8000 , 0x8000 , 0x8000 , 0x8000 , 0x8000 , 0x8000 , 0x8000 }
270- };
299+ static_cast <xpu64>(xpu16{1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 }),
300+ static_cast <xpu64>(xpu16{2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 }),
301+ static_cast <xpu64>(xpu16{4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 }),
302+ static_cast <xpu64>(xpu16{8 , 8 , 8 , 8 , 8 , 8 , 8 , 8 , 8 , 8 , 8 , 8 , 8 , 8 , 8 , 8 }),
303+ static_cast <xpu64>(xpu16{0x10 , 0x10 , 0x10 , 0x10 , 0x10 , 0x10 , 0x10 , 0x10 ,
304+ 0x10 , 0x10 , 0x10 , 0x10 , 0x10 , 0x10 , 0x10 , 0x10 }),
305+ static_cast <xpu64>(xpu16{0x20 , 0x20 , 0x20 , 0x20 , 0x20 , 0x20 , 0x20 , 0x20 ,
306+ 0x20 , 0x20 , 0x20 , 0x20 , 0x20 , 0x20 , 0x20 , 0x20 }),
307+ static_cast <xpu64>(xpu16{0x40 , 0x40 , 0x40 , 0x40 , 0x40 , 0x40 , 0x40 , 0x40 ,
308+ 0x40 , 0x40 , 0x40 , 0x40 , 0x40 , 0x40 , 0x40 , 0x40 }),
309+ static_cast <xpu64>(xpu16{0x80 , 0x80 , 0x80 , 0x80 , 0x80 , 0x80 , 0x80 , 0x80 ,
310+ 0x80 , 0x80 , 0x80 , 0x80 , 0x80 , 0x80 , 0x80 , 0x80 }),
311+ static_cast <xpu64>(xpu16{0x100 , 0x100 , 0x100 , 0x100 , 0x100 , 0x100 , 0x100 ,
312+ 0x100 , 0x100 , 0x100 , 0x100 , 0x100 , 0x100 , 0x100 ,
313+ 0x100 , 0x100 }),
314+ static_cast <xpu64>(xpu16{0x200 , 0x200 , 0x200 , 0x200 , 0x200 , 0x200 , 0x200 ,
315+ 0x200 , 0x200 , 0x200 , 0x200 , 0x200 , 0x200 , 0x200 ,
316+ 0x200 , 0x200 }),
317+ static_cast <xpu64>(xpu16{0x400 , 0x400 , 0x400 , 0x400 , 0x400 , 0x400 , 0x400 ,
318+ 0x400 , 0x400 , 0x400 , 0x400 , 0x400 , 0x400 , 0x400 ,
319+ 0x400 , 0x400 }),
320+ static_cast <xpu64>(xpu16{0x800 , 0x800 , 0x800 , 0x800 , 0x800 , 0x800 , 0x800 ,
321+ 0x800 , 0x800 , 0x800 , 0x800 , 0x800 , 0x800 , 0x800 ,
322+ 0x800 , 0x800 }),
323+ static_cast <xpu64>(xpu16{0x1000 , 0x1000 , 0x1000 , 0x1000 , 0x1000 , 0x1000 ,
324+ 0x1000 , 0x1000 , 0x1000 , 0x1000 , 0x1000 , 0x1000 ,
325+ 0x1000 , 0x1000 , 0x1000 , 0x1000 }),
326+ static_cast <xpu64>(xpu16{0x2000 , 0x2000 , 0x2000 , 0x2000 , 0x2000 , 0x2000 ,
327+ 0x2000 , 0x2000 , 0x2000 , 0x2000 , 0x2000 , 0x2000 ,
328+ 0x2000 , 0x2000 , 0x2000 , 0x2000 }),
329+ static_cast <xpu64>(xpu16{0x4000 , 0x4000 , 0x4000 , 0x4000 , 0x4000 , 0x4000 ,
330+ 0x4000 , 0x4000 , 0x4000 , 0x4000 , 0x4000 , 0x4000 ,
331+ 0x4000 , 0x4000 , 0x4000 , 0x4000 }),
332+ static_cast <xpu64>(xpu16{0x8000 , 0x8000 , 0x8000 , 0x8000 , 0x8000 , 0x8000 ,
333+ 0x8000 , 0x8000 , 0x8000 , 0x8000 , 0x8000 , 0x8000 ,
334+ 0x8000 , 0x8000 , 0x8000 , 0x8000 })};
271335
272336inline BMat16 BMat16::random (size_t const dim) {
    // TODO: Instead of nulling all the cols/rows one by one, one could do
    // that at once with the proper mask
274339 HPCOMBI_ASSERT (0 < dim && dim <= 16 );
275340 BMat16 bm = BMat16::random ();
276341 for (size_t i = dim; i < 16 ; ++i) {
@@ -290,8 +355,7 @@ inline std::ostream &BMat16::write(std::ostream &os) const {
290355 return os;
291356}
292357
293-
294- } // namespace HPCombi
358+ } // namespace HPCombi
295359
296360namespace std {
297361
0 commit comments