/******************************************************************************
 *
 * M4RI: Linear Algebra over GF(2)
 *
 * Copyright (C) 2007 Gregory Bard <[email protected]>
 * Copyright (C) 2009-2013 Martin Albrecht <[email protected]>
 * Copyright (C) 2011 Carlo Wood <[email protected]>
 *
 * Distributed under the terms of the GNU General Public License (GPL)
 * version 2 or higher.
 *
 * This code is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * The full text of the GPL is available at:
 *
 *   http://www.gnu.org/licenses/
 ******************************************************************************/

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#ifdef __M4RI_HAVE_LIBPNG
#include <png.h>
#endif

#include <stdlib.h>
#include <string.h>
#include "mzd.h"
#include "parity.h"
#include "mmc.h"


/**
 * \brief Cache of mzd_t containers
 */

typedef struct mzd_t_cache {
  mzd_t mzd[64];             /*!< cached matrices */
  struct mzd_t_cache *prev;  /*!< previous block */
  struct mzd_t_cache *next;  /*!< next block */
  uint64_t used;             /*!< bitmask of which matrices in this block are used */
  unsigned char padding[sizeof(mzd_t) - 2 * sizeof(struct mzd_t_cache*) - sizeof(uint64_t)];  /*!< alignment */
#ifdef __GNUC__
} mzd_t_cache_t __attribute__ ((__aligned__ (64)));
#else
} mzd_t_cache_t;
#endif

#define __M4RI_MZD_T_CACHE_MAX 16
static mzd_t_cache_t mzd_cache;
static mzd_t_cache_t* current_cache = &mzd_cache;

static int log2_floor(uint64_t v) {
  static uint64_t const b[] = { 0x2, 0xC, 0xF0, 0xFF00, 0xFFFF0000, 0xFFFFFFFF00000000 };
  static unsigned int const S[] = { 1, 2, 4, 8, 16, 32 };
  unsigned int r = 0;
  for (int i = 5; i >= 0; --i)
  {
    if ((v & b[i]))
    {
      v >>= S[i];
      r |= S[i];
    }
  }
  return r;
}
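
/*
 * A quick sanity check of log2_floor (illustrative only, not part of the
 * library): the result is the index of the highest set bit, i.e. the floor
 * of the base-2 logarithm.
 *
 *   assert(log2_floor(1)  == 0);   // 0b1
 *   assert(log2_floor(5)  == 2);   // 0b101
 *   assert(log2_floor(64) == 6);   // 0b1000000
 *
 * mzd_t_malloc below applies it to ~used to locate a free slot in a cache
 * block.
 */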

/*
 * Return a pointer to a new mzd_t structure.
 * The structure will be 64 byte aligned.
 * Call mzd_t_free to free the structure for next use.
 */

static mzd_t* mzd_t_malloc() {
#if __M4RI_ENABLE_MZD_CACHE == 0
  return (mzd_t*)m4ri_mm_malloc(sizeof(mzd_t));
#else
  mzd_t *ret = NULL;
  int i = 0;

  if (current_cache->used == (uint64_t)-1) {
    mzd_t_cache_t *cache = &mzd_cache;
    while (cache && cache->used == (uint64_t)-1) {
      current_cache = cache;
      cache = cache->next;
      i++;
    }
    if (!cache && i < __M4RI_MZD_T_CACHE_MAX) {
      cache = (mzd_t_cache_t*)m4ri_mm_malloc_aligned(sizeof(mzd_t_cache_t), 64);
      memset((char*)cache, 0, sizeof(mzd_t_cache_t));

      cache->prev = current_cache;
      current_cache->next = cache;
      current_cache = cache;
    } else if (!cache && i >= __M4RI_MZD_T_CACHE_MAX) {
      /* We have reached the upper limit on the number of caches */
      ret = (mzd_t*)m4ri_mm_malloc(sizeof(mzd_t));
    } else {
      current_cache = cache;
    }
  }
  if (ret == NULL) {
    int free_entry = log2_floor(~current_cache->used);
    current_cache->used |= ((uint64_t)1 << free_entry);
    ret = &current_cache->mzd[free_entry];
  }
  return ret;
#endif  // __M4RI_ENABLE_MZD_CACHE
}

static void mzd_t_free(mzd_t *M) {
#if __M4RI_ENABLE_MZD_CACHE == 0
  m4ri_mm_free(M);
#else
  int foundit = 0;
  mzd_t_cache_t *cache = &mzd_cache;
  while(cache) {
    size_t entry = M - cache->mzd;
    if (entry < 64) {
      cache->used &= ~((uint64_t)1 << entry);
      if (cache->used == 0) {
        if (cache == &mzd_cache) {
          current_cache = cache;
        } else {
          if (cache == current_cache) {
            current_cache = cache->prev;
          }
          cache->prev->next = cache->next;
          if (cache->next)
            cache->next->prev = cache->prev;
          m4ri_mm_free(cache);
        }
      }
      foundit = 1;
      break;
    }
    cache = cache->next;
  }
  if(!foundit) {
    m4ri_mm_free(M);
  }
#endif  // __M4RI_ENABLE_MZD_CACHE
}

mzd_t *mzd_init(rci_t r, rci_t c) {
  assert(sizeof(mzd_t) == 64);

  mzd_t *A = mzd_t_malloc();

  A->nrows = r;
  A->ncols = c;
  A->width = (c + m4ri_radix - 1) / m4ri_radix;
  A->rowstride = (A->width < mzd_paddingwidth || (A->width & 1) == 0) ? A->width : A->width + 1;
  A->high_bitmask = __M4RI_LEFT_BITMASK(c % m4ri_radix);
  A->flags = (A->high_bitmask != m4ri_ffff) ? mzd_flag_nonzero_excess : 0;
  A->offset_vector = 0;
  A->row_offset = 0;

  A->rows = (word**)m4ri_mmc_calloc(r + 1, sizeof(word*));  // We're overcommitting here.

  if (r && c) {
    int blockrows = __M4RI_MAX_MZD_BLOCKSIZE / A->rowstride;
    A->blockrows_log = 0;
    while(blockrows >>= 1)
      A->blockrows_log++;
    blockrows = 1 << A->blockrows_log;

    int const blockrows_mask = blockrows - 1;
    int const nblocks = (r + blockrows - 1) / blockrows;
    A->flags |= (nblocks > 1) ? mzd_flag_multiple_blocks : 0;
    A->blocks = (mzd_block_t*)m4ri_mmc_calloc(nblocks + 1, sizeof(mzd_block_t));

    size_t block_words = (r - (nblocks - 1) * blockrows) * A->rowstride;
    for(int i = nblocks - 1; i >= 0; --i) {
      A->blocks[i].size = block_words * sizeof(word);
      A->blocks[i].begin = (word*)m4ri_mmc_calloc(1, A->blocks[i].size);
      A->blocks[i].end = A->blocks[i].begin + block_words;
      block_words = blockrows * A->rowstride;
    }

    for(rci_t i = 0; i < A->nrows; ++i) {
      A->rows[i] = A->blocks[i >> A->blockrows_log].begin + (i & blockrows_mask) * A->rowstride;
    }

  } else {
    A->blocks = NULL;
  }

  return A;
}
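
/*
 * Illustrative usage sketch (assumes the public API declared in mzd.h):
 * allocate a matrix over GF(2), touch an entry, and release it again.
 *
 *   mzd_t *A = mzd_init(100, 100);  // 100 x 100 zero matrix
 *   mzd_write_bit(A, 0, 99, 1);     // set a single entry
 *   mzd_free(A);                    // returns rows/blocks to the mmc cache
 */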

/*
 * Explanation of offset_vector (in words) and row_offset.
 *
 * A window W into a matrix m shares m's memory blocks. skipped_blocks is the
 * number of blocks of m that lie entirely above the window; W->blocks points
 * at m->blocks[skipped_blocks]. W->row_offset is the index of the window's
 * first row within that block, and W->offset_vector is the distance, in
 * words, from the start of W->blocks[0] to the first word of W->rows[0]:
 * the row offset times the rowstride, plus the word offset
 * wrd_offset = lowc / m4ri_radix of the window's first column.
 */

mzd_t *mzd_init_window(mzd_t *M, rci_t lowr, rci_t lowc, rci_t highr, rci_t highc) {
  assert(lowc % m4ri_radix == 0);

  mzd_t *W = mzd_t_malloc();

  rci_t nrows = MIN(highr - lowr, M->nrows - lowr);
  rci_t ncols = highc - lowc;

  W->nrows = nrows;
  W->ncols = ncols;
  W->rowstride = M->rowstride;
  W->width = (ncols + m4ri_radix - 1) / m4ri_radix;
  W->high_bitmask = __M4RI_LEFT_BITMASK(ncols % m4ri_radix);

  W->flags = mzd_flag_windowed_zerooffset;
  W->flags |= (ncols % m4ri_radix == 0) ? mzd_flag_windowed_zeroexcess : mzd_flag_nonzero_excess;
  W->blockrows_log = M->blockrows_log;

  wi_t const blockrows_mask = (1 << W->blockrows_log) - 1;
  int const skipped_blocks = (M->row_offset + lowr) >> W->blockrows_log;
  assert(skipped_blocks == 0 || ((M->flags & mzd_flag_multiple_blocks)));
  W->row_offset = (M->row_offset + lowr) & blockrows_mask;
  W->blocks = &M->blocks[skipped_blocks];
  wi_t const wrd_offset = lowc / m4ri_radix;
  W->offset_vector = (M->offset_vector + wrd_offset) + (W->row_offset - M->row_offset) * W->rowstride;
  if(nrows)
    W->rows = (word**)m4ri_mmc_calloc(nrows + 1, sizeof(word*));
  else
    W->rows = NULL;
  for(rci_t i = 0; i < nrows; ++i) {
    W->rows[i] = M->rows[lowr + i] + wrd_offset;
  }
  if (mzd_row_to_block(W, nrows - 1) > 0)
    W->flags |= M->flags & mzd_flag_multiple_blocks;

  /* offset_vector is the distance from the start of the first block to the first word of the first row. */
  assert(nrows == 0 || W->blocks[0].begin + W->offset_vector == W->rows[0]);

  __M4RI_DD_MZD(W);
  return W;
}
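
/*
 * Illustrative sketch: a window is a read/write view, so changes through it
 * are visible in the parent matrix. The column offset must be a multiple of
 * m4ri_radix (= 64). mzd_free below detects that a window does not own its
 * blocks, so it only releases the window's row pointers.
 *
 *   mzd_t *A = mzd_init(128, 128);
 *   mzd_t *W = mzd_init_window(A, 0, 64, 64, 128);  // top-right 64 x 64 block
 *   mzd_set_ui(W, 1);                               // identity in that block
 *   mzd_free(W);                                    // A keeps the changes
 *   mzd_free(A);
 */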

void mzd_free(mzd_t *A) {
  if(A->rows)
    m4ri_mmc_free(A->rows, (A->nrows + 1) * sizeof(word*));
  if(mzd_owns_blocks(A)) {
    int i;
    for(i = 0; A->blocks[i].size; ++i) {
      m4ri_mmc_free(A->blocks[i].begin, A->blocks[i].size);
    }
    m4ri_mmc_free(A->blocks, (i + 1) * sizeof(mzd_block_t));
  }
  mzd_t_free(A);
}

void mzd_row_add(mzd_t *M, rci_t sourcerow, rci_t destrow) {
  mzd_row_add_offset(M, destrow, sourcerow, 0);
}

void mzd_row_clear_offset(mzd_t *M, rci_t row, rci_t coloffset) {
  wi_t const startblock = coloffset / m4ri_radix;
  word temp;

  /* make sure to start clearing at coloffset */
  if (coloffset % m4ri_radix) {
    temp = M->rows[row][startblock];
    temp &= __M4RI_RIGHT_BITMASK(m4ri_radix - coloffset);
  } else {
    temp = 0;
  }
  M->rows[row][startblock] = temp;
  for (wi_t i = startblock + 1; i < M->width; ++i) {
    M->rows[row][i] = 0;
  }

  __M4RI_DD_ROW(M, row);
}


rci_t mzd_gauss_delayed(mzd_t *M, rci_t startcol, int full) {
  rci_t startrow = startcol;
  rci_t pivots = 0;
  for (rci_t i = startcol; i < M->ncols; ++i) {
    for(rci_t j = startrow; j < M->nrows; ++j) {
      if (mzd_read_bit(M, j, i)) {
        mzd_row_swap(M, startrow, j);
        ++pivots;

        for(rci_t ii = full ? 0 : startrow + 1; ii < M->nrows; ++ii) {
          if (ii != startrow) {
            if (mzd_read_bit(M, ii, i)) {
              mzd_row_add_offset(M, ii, startrow, i);
            }
          }
        }
        startrow = startrow + 1;
        break;
      }
    }
  }

  __M4RI_DD_MZD(M);
  __M4RI_DD_RCI(pivots);
  return pivots;
}

rci_t mzd_echelonize_naive(mzd_t *M, int full) {
  return mzd_gauss_delayed(M, 0, full);
}
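
/*
 * Illustrative sketch: reduce a random system to (reduced) row echelon form
 * and read off its rank.
 *
 *   mzd_t *M = mzd_init(64, 128);
 *   mzd_randomize(M);
 *   rci_t rank = mzd_echelonize_naive(M, TRUE);  // TRUE: fully reduce
 *   // with FALSE, only entries below the pivots are cleared
 *   mzd_free(M);
 */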

/**
 * Transpose a 64 x 64 matrix with width 1.
 *
 * \param dst First word of destination matrix.
 * \param src First word of source matrix.
 * \param rowstride_dst Rowstride of matrix dst.
 * \param rowstride_src Rowstride of matrix src.
 *
 * Rows of both matrices are expected to fit exactly in a word (offset == 0)
 * and lie entirely inside a single block.
 *
 * \note This function also works when dst == src.
 */

static inline void _mzd_copy_transpose_64x64(word* RESTRICT dst, word const* RESTRICT src, wi_t rowstride_dst, wi_t rowstride_src)
{
  /*
   * m runs over the values:
   *   0x00000000FFFFFFFF
   *   0x0000FFFF0000FFFF
   *   0x00FF00FF00FF00FF
   *   0x0F0F0F0F0F0F0F0F
   *   0x3333333333333333
   *   0x5555555555555555,
   * alternating j zeroes with j ones.
   *
   * Assume we have a matrix consisting of four jxj matrices ((0,0) is in the top-right corner,
   * this is the memory-model view, see the layout on http://m4ri.sagemath.org/doxygen/structmzd__t.html):
   *   ...[A1][B1][A0][B0]
   *   ...[C1][D1][C0][D0]
   *          . [A2][B2]
   *          . [C2][D2]
   *          .        .
   *                   .
   * The following calculates the XOR between A and D,
   * and subsequently applies that to A and D respectively,
   * swapping A and D as a result.
   * Therefore wk starts at the first row and then has rowstride
   * added j times, running over the rows of A, then skips C
   * by adding j * rowstride to continue with the next A below C.
   */

  word m = __M4RI_CONVERT_TO_WORD(0xFFFFFFFF);
  wi_t j_rowstride_dst = rowstride_dst * 64;
  wi_t j_rowstride_src = rowstride_src * 32;
  word* const end = dst + j_rowstride_dst;
  // We start with j = 32, and a one-time unrolled loop, where
  // we copy from src and write the result to dst, swapping
  // the two 32x32 corner matrices.
  int j = 32;
  j_rowstride_dst >>= 1;
  word* RESTRICT wk = dst;
  for (word const* RESTRICT wks = src; wk < end; wk += j_rowstride_dst, wks += j_rowstride_src) {
    for (int k = 0; k < j; ++k, wk += rowstride_dst, wks += rowstride_src) {
      word xor = ((*wks >> j) ^ *(wks + j_rowstride_src)) & m;
      *wk = *wks ^ (xor << j);
      *(wk + j_rowstride_dst) = *(wks + j_rowstride_src) ^ xor;
    }
  }
  // Next we work in-place in dst and swap the corners of
  // each of the last matrices, all in parallel, for all
  // remaining values of j.
  m ^= m << 16;
  for (j = 16; j != 0; j = j >> 1, m ^= m << j) {
    j_rowstride_dst >>= 1;
    for (wk = dst; wk < end; wk += j_rowstride_dst) {
      for (int k = 0; k < j; ++k, wk += rowstride_dst) {
        word xor = ((*wk >> j) ^ *(wk + j_rowstride_dst)) & m;
        *wk ^= xor << j;
        *(wk + j_rowstride_dst) ^= xor;
      }
    }
  }
}
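
/*
 * The three-XOR step above is the classic "delta swap": given words x and y,
 * a mask m and a shift j,
 *
 *   word d = ((x >> j) ^ y) & m;
 *   x ^= d << j;   // bits of x selected by (m << j) ...
 *   y ^= d;        // ... are exchanged with bits of y selected by m
 *
 * exchanges the two bit fields in place, with no temporaries besides d.
 * Repeating it for j = 32, 16, 8, 4, 2, 1 with the masks listed above
 * transposes the 64x64 bit matrix in O(64 log 64) word operations instead
 * of the naive 64 * 64 single-bit reads and writes.
 */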

/**
 * Transpose two 64 x 64 matrices with width 1.
 *
 * \param dst1 First word of destination matrix 1.
 * \param dst2 First word of destination matrix 2.
 * \param src1 First word of source matrix 1.
 * \param src2 First word of source matrix 2.
 * \param rowstride_dst Rowstride of destination matrices.
 * \param rowstride_src Rowstride of source matrices.
 *
 * Rows of all matrices are expected to fit exactly in a word (offset == 0)
 * and lie entirely inside a single block.
 *
 * \note This function also works to transpose in-place.
 */

static inline void _mzd_copy_transpose_64x64_2(word* RESTRICT dst1, word* RESTRICT dst2, word const* RESTRICT src1, word const* RESTRICT src2, wi_t rowstride_dst, wi_t rowstride_src)
{
  word m = __M4RI_CONVERT_TO_WORD(0xFFFFFFFF);
  wi_t j_rowstride_dst = rowstride_dst * 64;
  wi_t j_rowstride_src = rowstride_src * 32;
  word* const end = dst1 + j_rowstride_dst;
  int j = 32;
  word* RESTRICT wk[2];
  word const* RESTRICT wks[2];
  word xor[2];

  j_rowstride_dst >>= 1;
  wk[0] = dst1;
  wk[1] = dst2;
  wks[0] = src1;
  wks[1] = src2;

  do {

    for (int k = 0; k < j; ++k) {
      xor[0] = ((*wks[0] >> j) ^ *(wks[0] + j_rowstride_src)) & m;
      xor[1] = ((*wks[1] >> j) ^ *(wks[1] + j_rowstride_src)) & m;
      *wk[0] = *wks[0] ^ (xor[0] << j);
      *wk[1] = *wks[1] ^ (xor[1] << j);
      *(wk[0] + j_rowstride_dst) = *(wks[0] + j_rowstride_src) ^ xor[0];
      *(wk[1] + j_rowstride_dst) = *(wks[1] + j_rowstride_src) ^ xor[1];
      wk[0] += rowstride_dst;
      wk[1] += rowstride_dst;
      wks[0] += rowstride_src;
      wks[1] += rowstride_src;
    }

    wk[0] += j_rowstride_dst;
    wk[1] += j_rowstride_dst;
    wks[0] += j_rowstride_src;
    wks[1] += j_rowstride_src;

  } while(wk[0] < end);

  m ^= m << 16;
  for (j = 16; j != 0; j = j >> 1, m ^= m << j) {

    j_rowstride_dst >>= 1;
    wk[0] = dst1;
    wk[1] = dst2;

    do {

      for (int k = 0; k < j; ++k) {
        xor[0] = ((*wk[0] >> j) ^ *(wk[0] + j_rowstride_dst)) & m;
        xor[1] = ((*wk[1] >> j) ^ *(wk[1] + j_rowstride_dst)) & m;
        *wk[0] ^= xor[0] << j;
        *wk[1] ^= xor[1] << j;
        *(wk[0] + j_rowstride_dst) ^= xor[0];
        *(wk[1] + j_rowstride_dst) ^= xor[1];
        wk[0] += rowstride_dst;
        wk[1] += rowstride_dst;
      }

      wk[0] += j_rowstride_dst;
      wk[1] += j_rowstride_dst;

    } while(wk[0] < end);
  }
}

static unsigned char log2_ceil_table[64] = {
  0, 1, 2, 2, 3, 3, 3, 3,
  4, 4, 4, 4, 4, 4, 4, 4,
  5, 5, 5, 5, 5, 5, 5, 5,
  5, 5, 5, 5, 5, 5, 5, 5,
  6, 6, 6, 6, 6, 6, 6, 6,
  6, 6, 6, 6, 6, 6, 6, 6,
  6, 6, 6, 6, 6, 6, 6, 6,
  6, 6, 6, 6, 6, 6, 6, 6
};

static inline int log2_ceil(int n)
{
  return log2_ceil_table[n - 1];
}

static word const transpose_mask[6] = {
  0x5555555555555555ULL,
  0x3333333333333333ULL,
  0x0F0F0F0F0F0F0F0FULL,
  0x00FF00FF00FF00FFULL,
  0x0000FFFF0000FFFFULL,
  0x00000000FFFFFFFFULL,
};

/**
 * Transpose 64/j matrices of size jxj in parallel, where j equals n rounded
 * up to the nearest power of 2.
 * The input array t must be of size j (containing the rows i of all matrices in t[i]).
 *
 * t[0..{j-1}] = [Al]...[A1][A0]
 *
 * \param t An array of j words.
 * \param n The number of rows in each matrix.
 *
 * \return log2(j)
 */

static inline int _mzd_transpose_Nxjx64(word* RESTRICT t, int n)
{
  int j = 1;
  int mi = 0;  // Index into the transpose_mask array.

  while (j < n)  // Don't swap with entirely undefined data (where [D] consists entirely of non-existent rows).
  {
    // Swap 64/j matrices of size jxj in 2j rows. Thus,
    //   <---- one word --->
    //   [Al][Bl]...[A0][B0]
    //   [Cl][Dl]...[C0][D0], where l = 64/j - 1 and each matrix [A], [B] etc is jxj.
    // Then swap [A] and [D] in-place.

    // m runs over the values in transpose_mask, so that at all
    // times m consists of j zeroes followed by j ones, repeated.
    word const m = transpose_mask[mi];
    int k = 0;  // Index into t[].
    do {
      // Run over all rows of [A] and [D].
      for (int i = 0; i < j; ++i, ++k) {
        // t[k] contains row i of all [A], and t[k + j] contains row i of all [D]. Swap them.
        word xor = ((t[k] >> j) ^ t[k + j]) & m;
        t[k] ^= xor << j;
        t[k + j] ^= xor;
      }
      k += j;  // Skip [C].
    } while (k < n);  // Stop if we passed all valid input.

    // Double the size of j and repeat this for the next 2j rows until all
    // n rows have been swapped (possibly with non-existent rows).
    j <<= 1;
    ++mi;
  }

  return mi;
}

/**
 * Transpose a n x 64 matrix with width 1.
 *
 * \param dst First word of destination matrix.
 * \param src First word of source matrix.
 * \param rowstride_dst Rowstride of destination matrix.
 * \param rowstride_src Rowstride of source matrix.
 * \param n Number of rows in source matrix, must be less than 64.
 *
 * Rows of all matrices are expected to have offset zero
 * and lie entirely inside a single block.
 *
 * \note This function also works to transpose in-place.
 */

static inline void _mzd_copy_transpose_lt64x64(word* RESTRICT dst, word const* RESTRICT src, wi_t rowstride_dst, wi_t rowstride_src, int n)
{
  // Preload the n input rows into level 1, using a minimum of cache lines (compact storage).
  word t[64];
  word const* RESTRICT wks = src;
  int k;
  for (k = 0; k < n; ++k) {
    t[k] = *wks;
    wks += rowstride_src;
  }
  if (n > 32) {
    while (k < 64)
      t[k++] = 0;
    _mzd_copy_transpose_64x64(dst, t, rowstride_dst, 1);
    return;
  }
  int log2j = _mzd_transpose_Nxjx64(t, n);
  // All output bits are now transposed, but still might need to be shifted in place.
  // What we have now is 64/j matrices of size jxj. Thus,
  //   [Al]...[A1][A0], where l = 64/j - 1,
  // while the actual output is:
  //   [A0]
  //   [A1]
  //   ...
  //   [Al]
  word const m = __M4RI_LEFT_BITMASK(n);
  word* RESTRICT wk = dst;
  switch (log2j) {
  case 5:
  {
    wi_t const j_rowstride_dst = 32 * rowstride_dst;
    for (int k = 0; k < 32; ++k) {
      wk[0] = t[k] & m;
      wk[j_rowstride_dst] = (t[k] >> 32) & m;
      wk += rowstride_dst;
    }
    break;
  }
  case 4:
  {
    wi_t const j_rowstride_dst = 16 * rowstride_dst;
    for (int k = 0; k < 16; ++k) {
      wk[0] = t[k] & m;
      wk[j_rowstride_dst] = (t[k] >> 16) & m;
      wk[2 * j_rowstride_dst] = (t[k] >> 32) & m;
      wk[3 * j_rowstride_dst] = (t[k] >> 48) & m;
      wk += rowstride_dst;
    }
    break;
  }
  case 3:
  {
    wi_t const j_rowstride_dst = 8 * rowstride_dst;
    for (int k = 0; k < 8; ++k) {
      wk[0] = t[k] & m;
      wk[j_rowstride_dst] = (t[k] >> 8) & m;
      wk[2 * j_rowstride_dst] = (t[k] >> 16) & m;
      wk[3 * j_rowstride_dst] = (t[k] >> 24) & m;
      wk[4 * j_rowstride_dst] = (t[k] >> 32) & m;
      wk[5 * j_rowstride_dst] = (t[k] >> 40) & m;
      wk[6 * j_rowstride_dst] = (t[k] >> 48) & m;
      wk[7 * j_rowstride_dst] = (t[k] >> 56) & m;
      wk += rowstride_dst;
    }
    break;
  }
  case 2:
  {
    wi_t const j_rowstride_dst = 4 * rowstride_dst;
    for (int k = 0; k < 4; ++k) {
      word* RESTRICT wk2 = wk;
      word tk = t[k];
      for (int i = 0; i < 2; ++i) {
        wk2[0] = tk & m;
        wk2[j_rowstride_dst] = (tk >> 4) & m;
        wk2[2 * j_rowstride_dst] = (tk >> 8) & m;
        wk2[3 * j_rowstride_dst] = (tk >> 12) & m;
        wk2[4 * j_rowstride_dst] = (tk >> 16) & m;
        wk2[5 * j_rowstride_dst] = (tk >> 20) & m;
        wk2[6 * j_rowstride_dst] = (tk >> 24) & m;
        wk2[7 * j_rowstride_dst] = (tk >> 28) & m;
        wk2 += 8 * j_rowstride_dst;
        tk >>= 32;
      }
      wk += rowstride_dst;
    }
    break;
  }
  case 1:
  {
    wi_t const j_rowstride_dst = 2 * rowstride_dst;
    for (int k = 0; k < 2; ++k) {
      word* RESTRICT wk2 = wk;
      word tk = t[k];
      for (int i = 0; i < 8; ++i) {
        wk2[0] = tk & m;
        wk2[j_rowstride_dst] = (tk >> 2) & m;
        wk2[2 * j_rowstride_dst] = (tk >> 4) & m;
        wk2[3 * j_rowstride_dst] = (tk >> 6) & m;
        wk2 += 4 * j_rowstride_dst;
        tk >>= 8;
      }
      wk += rowstride_dst;
    }
    break;
  }
  case 0:
  {
    word* RESTRICT wk2 = wk;
    word tk = t[0];
    for (int i = 0; i < 16; ++i) {
      wk2[0] = tk & m;
      wk2[rowstride_dst] = (tk >> 1) & m;
      wk2[2 * rowstride_dst] = (tk >> 2) & m;
      wk2[3 * rowstride_dst] = (tk >> 3) & m;
      wk2 += 4 * rowstride_dst;
      tk >>= 4;
    }
    break;
  }
  }
}

/**
 * Transpose a 64 x n matrix with width 1.
 *
 * \param dst First word of destination matrix.
 * \param src First word of source matrix.
 * \param rowstride_dst Rowstride of destination matrix.
 * \param rowstride_src Rowstride of source matrix.
 * \param n Number of columns in source matrix, must be less than 64.
 *
 * Rows of all matrices are expected to have offset zero
 * and lie entirely inside a single block.
 *
 * \note This function also works to transpose in-place.
 */

static inline void _mzd_copy_transpose_64xlt64(word* RESTRICT dst, word const* RESTRICT src, wi_t rowstride_dst, wi_t rowstride_src, int n)
{
  word t[64];
  int log2j = log2_ceil(n);
  word const* RESTRICT wks = src;
  switch (log2j) {
  case 6:
  {
    _mzd_copy_transpose_64x64(t, src, 1, rowstride_src);
    word* RESTRICT wk = dst;
    for (int k = 0; k < n; ++k) {
      *wk = t[k];
      wk += rowstride_dst;
    }
    return;
  }
  case 5:
  {
    wi_t const j_rowstride_src = 32 * rowstride_src;
    for (int k = 0; k < 32; ++k) {
      t[k] = wks[0] | (wks[j_rowstride_src] << 32);
      wks += rowstride_src;
    }
    break;
  }
  case 4:
  {
    wi_t const j_rowstride_src = 16 * rowstride_src;
    for (int k = 0; k < 16; ++k) {
      t[k] = wks[0] | (wks[j_rowstride_src] << 16);
      t[k] |= (wks[2 * j_rowstride_src] << 32) | (wks[3 * j_rowstride_src] << 48);
      wks += rowstride_src;
    }
    break;
  }
  case 3:
  {
    wi_t const j_rowstride_src = 8 * rowstride_src;
    word tt;
    for (int k = 0; k < 8; ++k) {
      tt = wks[0] | (wks[j_rowstride_src] << 8);
      t[k] = (wks[2 * j_rowstride_src] << 16) | (wks[3 * j_rowstride_src] << 24);
      tt |= (wks[4 * j_rowstride_src] << 32) | (wks[5 * j_rowstride_src] << 40);
      t[k] |= (wks[6 * j_rowstride_src] << 48) | (wks[7 * j_rowstride_src] << 56);
      wks += rowstride_src;
      t[k] |= tt;
    }
    break;
  }
  case 2:
  {
    word const* RESTRICT wks2 = wks + 60 * rowstride_src;
    t[0] = wks2[0];
    t[1] = wks2[rowstride_src];
    t[2] = wks2[2 * rowstride_src];
    t[3] = wks2[3 * rowstride_src];
    for (int i = 0; i < 15; ++i) {
      wks2 -= 4 * rowstride_src;
      t[0] <<= 4;
      t[1] <<= 4;
      t[2] <<= 4;
      t[3] <<= 4;
      t[0] |= wks2[0];
      t[1] |= wks2[rowstride_src];
      t[2] |= wks2[2 * rowstride_src];
      t[3] |= wks2[3 * rowstride_src];
    }
    break;
  }
  case 1:
  {
    wks += 62 * rowstride_src;
    t[0] = wks[0];
    t[1] = wks[rowstride_src];
    for (int i = 0; i < 31; ++i) {
      wks -= 2 * rowstride_src;
      t[0] <<= 2;
      t[1] <<= 2;
      t[0] |= wks[0];
      t[1] |= wks[rowstride_src];
    }
    break;
  }
  case 0:
  {
    word tt[2];
    tt[0] = wks[0];
    tt[1] = wks[rowstride_src];
    for (int i = 2; i < 64; i += 2) {
      wks += 2 * rowstride_src;
      tt[0] |= wks[0] << i;
      tt[1] |= wks[rowstride_src] << i;
    }
    *dst = tt[0] | (tt[1] << 1);
    return;
  }
  }
  int j = 1 << log2j;
  _mzd_transpose_Nxjx64(t, j);
  word* RESTRICT wk = dst;
  for (int k = 0; k < n; ++k) {
    *wk = t[k];
    wk += rowstride_dst;
  }
}

/**
 * Transpose a n x m matrix with width 1, offset 0 and m and n less than or equal to 8.
 *
 * \param dst First word of destination matrix.
 * \param src First word of source matrix.
 * \param rowstride_dst Rowstride of destination matrix.
 * \param rowstride_src Rowstride of source matrix.
 * \param n Number of rows in source matrix, must be less than or equal to 8.
 * \param m Number of columns in source matrix, must be less than or equal to 8.
 * \param maxsize The maximum of n and m; bounds the number of swap rounds.
 *
 * Rows of all matrices are expected to have offset zero
 * and lie entirely inside a single block.
 *
 * \note This function also works to transpose in-place.
 */

static inline void _mzd_copy_transpose_le8xle8(word* RESTRICT dst, word const* RESTRICT src, wi_t rowstride_dst, wi_t rowstride_src, int n, int m, int maxsize)
{
  int end = maxsize * 7;
  word const* RESTRICT wks = src;
  word w = *wks;
  int shift = 0;
  for (int i = 1; i < n; ++i) {
    wks += rowstride_src;
    shift += 8;
    w |= (*wks << shift);
  }
  word mask = 0x80402010080402ULL;
  word w7 = w >> 7;
  shift = 7;
  --m;
  do {
    word xor = (w ^ w7) & mask;
    mask >>= 8;
    w ^= (xor << shift);
    shift += 7;
    w7 >>= 7;
    w ^= xor;
  } while(shift < end);
  word* RESTRICT wk = dst + m * rowstride_dst;
  for (int shift = 8 * m; shift > 0; shift -= 8) {
    *wk = (unsigned char)(w >> shift);
    wk -= rowstride_dst;
  }
  *wk = (unsigned char)w;
}

/**
 * Transpose a n x m matrix with width 1, offset 0 and m and n less than or equal to 16.
 *
 * \param dst First word of destination matrix.
 * \param src First word of source matrix.
 * \param rowstride_dst Rowstride of destination matrix.
 * \param rowstride_src Rowstride of source matrix.
 * \param n Number of rows in source matrix, must be less than or equal to 16.
 * \param m Number of columns in source matrix, must be less than or equal to 16.
 * \param maxsize The maximum of n and m; bounds the number of swap rounds.
 *
 * Rows of all matrices are expected to have offset zero
 * and lie entirely inside a single block.
 *
 * \note This function also works to transpose in-place.
 */

static inline void _mzd_copy_transpose_le16xle16(word* RESTRICT dst, word const* RESTRICT src, wi_t rowstride_dst, wi_t rowstride_src, int n, int m, int maxsize)
{
  int end = maxsize * 3;
  word const* RESTRICT wks = src;
  word t[4];
  int i = n;
  do {
    t[0] = wks[0];
    if (--i == 0) {
      t[1] = 0;
      t[2] = 0;
      t[3] = 0;
      break;
    }
    t[1] = wks[rowstride_src];
    if (--i == 0) {
      t[2] = 0;
      t[3] = 0;
      break;
    }
    t[2] = wks[2 * rowstride_src];
    if (--i == 0) {
      t[3] = 0;
      break;
    }
    t[3] = wks[3 * rowstride_src];
    if (--i == 0)
      break;
    wks += 4 * rowstride_src;
    for(int shift = 16;; shift += 16) {
      t[0] |= (*wks << shift);
      if (--i == 0)
        break;
      t[1] |= (wks[rowstride_src] << shift);
      if (--i == 0)
        break;
      t[2] |= (wks[2 * rowstride_src] << shift);
      if (--i == 0)
        break;
      t[3] |= (wks[3 * rowstride_src] << shift);
      if (--i == 0)
        break;
      wks += 4 * rowstride_src;
    }
  } while(0);
  word mask = 0xF0000F0000F0ULL;
  int shift = 12;
  word xor[4];
  do {
    xor[0] = (t[0] ^ (t[0] >> shift)) & mask;
    xor[1] = (t[1] ^ (t[1] >> shift)) & mask;
    xor[2] = (t[2] ^ (t[2] >> shift)) & mask;
    xor[3] = (t[3] ^ (t[3] >> shift)) & mask;
    mask >>= 16;
    t[0] ^= (xor[0] << shift);
    t[1] ^= (xor[1] << shift);
    t[2] ^= (xor[2] << shift);
    t[3] ^= (xor[3] << shift);
    shift += 12;
    t[0] ^= xor[0];
    t[1] ^= xor[1];
    t[2] ^= xor[2];
    t[3] ^= xor[3];
  } while(shift < end);
  _mzd_transpose_Nxjx64(t, 4);
  i = m;
  word* RESTRICT wk = dst;
  do {
    wk[0] = (uint16_t)t[0];
    if (--i == 0)
      break;
    wk[rowstride_dst] = (uint16_t)t[1];
    if (--i == 0)
      break;
    wk[2 * rowstride_dst] = (uint16_t)t[2];
    if (--i == 0)
      break;
    wk[3 * rowstride_dst] = (uint16_t)t[3];
    if (--i == 0)
      break;
    wk += 4 * rowstride_dst;
    for(int shift = 16;; shift += 16) {
      wk[0] = (uint16_t)(t[0] >> shift);
      if (--i == 0)
        break;
      wk[rowstride_dst] = (uint16_t)(t[1] >> shift);
      if (--i == 0)
        break;
      wk[2 * rowstride_dst] = (uint16_t)(t[2] >> shift);
      if (--i == 0)
        break;
      wk[3 * rowstride_dst] = (uint16_t)(t[3] >> shift);
      if (--i == 0)
        break;
      wk += 4 * rowstride_dst;
    }
  } while(0);
}

/**
 * Transpose a n x m matrix with width 1, offset 0 and m and n less than or equal to 32.
 *
 * \param dst First word of destination matrix.
 * \param src First word of source matrix.
 * \param rowstride_dst Rowstride of destination matrix.
 * \param rowstride_src Rowstride of source matrix.
 * \param n Number of rows in source matrix, must be less than or equal to 32.
 * \param m Number of columns in source matrix, must be less than or equal to 32.
 *
 * Rows of all matrices are expected to have offset zero
 * and lie entirely inside a single block.
 *
 * \note This function also works to transpose in-place.
 */

static inline void _mzd_copy_transpose_le32xle32(word* RESTRICT dst, word const* RESTRICT src, wi_t rowstride_dst, wi_t rowstride_src, int n, int m)
{
  word const* RESTRICT wks = src;
  word t[16];
  int i = n;
  if (n > 16) {
    i -= 16;
    for (int j = 0; j < 16; ++j) {
      t[j] = *wks;
      wks += rowstride_src;
    }
    int j = 0;
    do {
      t[j++] |= (*wks << 32);
      wks += rowstride_src;
    } while(--i);
  } else {
    int j;
    for (j = 0; j < n; ++j) {
      t[j] = *wks;
      wks += rowstride_src;
    }
    for (; j < 16; ++j)
      t[j] = 0;
  }
  _mzd_transpose_Nxjx64(t, 16);
  int one_more = (m & 1);
  word* RESTRICT wk = dst;
  if (m > 16) {
    m -= 16;
    for (int j = 0; j < 16; j += 2) {
      *wk = (t[j] & 0xFFFF) | ((t[j] >> 16) & 0xFFFF0000);
      wk[rowstride_dst] = (t[j + 1] & 0xFFFF) | ((t[j + 1] >> 16) & 0xFFFF0000);
      wk += 2 * rowstride_dst;
    }
    for (int j = 1; j < m; j += 2) {
      *wk = ((t[j - 1] >> 16) & 0xFFFF) | ((t[j - 1] >> 32) & 0xFFFF0000);
      wk[rowstride_dst] = ((t[j] >> 16) & 0xFFFF) | ((t[j] >> 32) & 0xFFFF0000);
      wk += 2 * rowstride_dst;
    }
    if (one_more) {
      *wk = ((t[m - 1] >> 16) & 0xFFFF) | ((t[m - 1] >> 32) & 0xFFFF0000);
    }
  } else {
    for (int j = 1; j < m; j += 2) {
      *wk = (t[j - 1] & 0xFFFF) | ((t[j - 1] >> 16) & 0xFFFF0000);
      wk[rowstride_dst] = (t[j] & 0xFFFF) | ((t[j] >> 16) & 0xFFFF0000);
      wk += 2 * rowstride_dst;
    }
    if (one_more) {
      *wk = (t[m - 1] & 0xFFFF) | ((t[m - 1] >> 16) & 0xFFFF0000);
    }
  }
}

static inline void _mzd_copy_transpose_le64xle64(word* RESTRICT dst, word const* RESTRICT src, wi_t rowstride_dst, wi_t rowstride_src, int n, int m)
{
  word const* RESTRICT wks = src;
  word t[64];
  int k;
  for (k = 0; k < n; ++k) {
    t[k] = *wks;
    wks += rowstride_src;
  }
  while(k < 64)
    t[k++] = 0;
  _mzd_copy_transpose_64x64(t, t, 1, 1);
  word* RESTRICT wk = dst;
  for (int k = 0; k < m; ++k) {
    *wk = t[k];
    wk += rowstride_dst;
  }
  return;
}

void _mzd_transpose_multiblock(mzd_t *DST, mzd_t const *A, word* RESTRICT* fwdp, word const* RESTRICT* fwsp, rci_t* nrowsp, rci_t* ncolsp);

mzd_t *_mzd_transpose(mzd_t *DST, mzd_t const *A) {
  assert(!mzd_is_windowed(DST) && !mzd_is_windowed(A));
  // We assume that at least 64 rows fit in a block; if
  // that is the case then each block will contain a multiple
  // of 64 rows, since blockrows is a power of 2.
  assert(A->blockrows_log >= 6 && DST->blockrows_log >= 6);

  rci_t nrows = A->nrows;
  rci_t ncols = A->ncols;
  rci_t maxsize = MAX(nrows, ncols);

  word* RESTRICT fwd = mzd_first_row(DST);
  word const* RESTRICT fws = mzd_first_row(A);

  if (maxsize >= 64) {

    // This is the most non-intrusive way to deal with the case of multiple blocks.
    // Note that this code is VERY sensitive. ANY change to _mzd_transpose can easily
    // reduce the speed for small matrices (up to 64x64) by 5 to 10%.
    int const multiple_blocks = (A->flags | DST->flags) & mzd_flag_multiple_blocks;
    if (__M4RI_UNLIKELY(multiple_blocks)) {
      word* RESTRICT non_register_fwd;
      word const* RESTRICT non_register_fws;
      rci_t non_register_nrows;
      rci_t non_register_ncols;
      _mzd_transpose_multiblock(DST, A, &non_register_fwd, &non_register_fws, &non_register_nrows, &non_register_ncols);
      fwd = non_register_fwd;
      fws = non_register_fws;
      nrows = non_register_nrows;
      ncols = non_register_ncols;
    }

    if (nrows >= 64) {
      /*
       * This is an interesting #if ...
       * I recommend investigating the number of instructions, and the clocks per instruction,
       * as a function of various sizes of the matrix (most likely especially the number of columns
       * (the size of a row) will have influence; also always use multiples of 64 or even 128),
       * for both cases below.
       *
       * To measure this run for example:
       *
       *   ./bench_mzd -m 10 -x 10 -p PAPI_TOT_INS,PAPI_L1_TCM,PAPI_L2_TCM mzd_transpose 32000 32000
       *   ./bench_mzd -m 10 -x 100 -p PAPI_TOT_INS,PAPI_L1_TCM,PAPI_L2_TCM mzd_transpose 128 10240
       * etc (increase -x for smaller sizes to get better accuracy).
       *
       * --Carlo Wood
       */
#if 1
      int js = ncols & nrows & 64;  // True if the total number of whole 64x64 matrices is odd.
      wi_t const rowstride_64_dst = 64 * DST->rowstride;
      word* RESTRICT fwd_current = fwd;
      word const* RESTRICT fws_current = fws;
      if (js) {
        js = 1;
        _mzd_copy_transpose_64x64(fwd, fws, DST->rowstride, A->rowstride);
        if ((nrows | ncols) == 64) {
          __M4RI_DD_MZD(DST);
          return DST;
        }
        fwd_current += rowstride_64_dst;
        ++fws_current;
      }
      rci_t const whole_64cols = ncols / 64;
      // The use of delayed and even is to avoid calling _mzd_copy_transpose_64x64_2 twice.
      // This way it can be inlined without duplicating the amount of code that has to be loaded.
      word* RESTRICT fwd_delayed = NULL;
      word const* RESTRICT fws_delayed = NULL;
      int even = 0;
      while (1)
      {
        for (int j = js; j < whole_64cols; ++j) {
          if (!even) {
            fwd_delayed = fwd_current;
            fws_delayed = fws_current;
          } else {
            _mzd_copy_transpose_64x64_2(fwd_delayed, fwd_current, fws_delayed, fws_current, DST->rowstride, A->rowstride);
          }
          fwd_current += rowstride_64_dst;
          ++fws_current;
          even = !even;
        }
        nrows -= 64;
        if (ncols % 64) {
          _mzd_copy_transpose_64xlt64(fwd + whole_64cols * rowstride_64_dst, fws + whole_64cols, DST->rowstride, A->rowstride, ncols % 64);
        }
        fwd += 1;
        fws += 64 * A->rowstride;
        if (nrows < 64)
          break;
        js = 0;
        fws_current = fws;
        fwd_current = fwd;
      }
#else
      // The same as the above, but without using _mzd_copy_transpose_64x64_2.
      wi_t const rowstride_64_dst = 64 * DST->rowstride;
      rci_t const whole_64cols = ncols / 64;
      assert(nrows >= 64);
      do {
        for (int j = 0; j < whole_64cols; ++j) {
          _mzd_copy_transpose_64x64(fwd + j * rowstride_64_dst, fws + j, DST->rowstride, A->rowstride);
        }
        nrows -= 64;
        if (ncols % 64) {
          _mzd_copy_transpose_64xlt64(fwd + whole_64cols * rowstride_64_dst, fws + whole_64cols, DST->rowstride, A->rowstride, ncols % 64);
        }
        fwd += 1;
        fws += 64 * A->rowstride;
      } while(nrows >= 64);
#endif
    }

    if (nrows == 0) {
      __M4RI_DD_MZD(DST);
      return DST;
    }

    // Transpose the remaining top rows. Now 0 < nrows < 64.

    while (ncols >= 64)
    {
      _mzd_copy_transpose_lt64x64(fwd, fws, DST->rowstride, A->rowstride, nrows);
      ncols -= 64;
      fwd += 64 * DST->rowstride;
      fws += 1;
    }

    if (ncols == 0) {
      __M4RI_DD_MZD(DST);
      return DST;
    }

    maxsize = MAX(nrows, ncols);
  }

  // Transpose the remaining corner. Now both 0 < nrows < 64 and 0 < ncols < 64.

  if (maxsize <= 8) {
    _mzd_copy_transpose_le8xle8(fwd, fws, DST->rowstride, A->rowstride, nrows, ncols, maxsize);
  }
  else if (maxsize <= 16) {
    _mzd_copy_transpose_le16xle16(fwd, fws, DST->rowstride, A->rowstride, nrows, ncols, maxsize);
  }
  else if (maxsize <= 32) {
    _mzd_copy_transpose_le32xle32(fwd, fws, DST->rowstride, A->rowstride, nrows, ncols);
  }
  else {
    _mzd_copy_transpose_le64xle64(fwd, fws, DST->rowstride, A->rowstride, nrows, ncols);
  }

  __M4RI_DD_MZD(DST);
  return DST;
}

void _mzd_transpose_multiblock(mzd_t *DST, mzd_t const *A, word* RESTRICT* fwdp, word const* RESTRICT* fwsp, rci_t* nrowsp, rci_t* ncolsp) {

  rci_t nrows = A->nrows;
  rci_t ncols = A->ncols;

  rci_t blockrows_dst = 1 << DST->blockrows_log;  // The maximum number of rows in a block of DST.
  rci_t blockrows_src = 1 << A->blockrows_log;    // The maximum number of rows in a block of A.

  /* We're dividing the source matrix into blocks of multiples of 64x64, such
   * that each block fits entirely inside a single memory allocation block,
   * both in the source as well as in the corresponding destination.
   *
   * Picture matrix A cut into a grid whose cells are blockrows_src rows high
   * and blockrows_dst columns wide, with P and Q two cells in the strip of
   * A->blocks[0] and R and S the cells below them in the strip of
   * A->blocks[1]. Imagine the same grid to be the memory map of DST,
   * mirrored in the diagonal line from the top/right to the bottom/left.
   * Then each of the cells P, Q, R and S lies entirely inside one memory
   * block in both the source as well as the destination matrix. P and Q are
   * really the same block for matrix A (as are R and S), while P and R (and
   * Q and S) are really the same block for DST.
   *
   * We're going to run over the top/right corners of each of these memory
   * "blocks" and then transpose them, one by one, running from right to left
   * and top to bottom. The last one (the corner starting at row R_top and
   * column R_right) will be done by the calling function, so we just return
   * when we get there.
   */

  rci_t R_top = (nrows >> A->blockrows_log) << A->blockrows_log;
  rci_t R_right = (ncols >> DST->blockrows_log) << DST->blockrows_log;
  for (rci_t col = 0; col < ncols; col += blockrows_dst) {
    rci_t end = (col == R_right) ? R_top : nrows;
    for (rci_t row = 0; row < end; row += blockrows_src) {

      rci_t nrowsb = (row < R_top) ? blockrows_src : (nrows - R_top);
      rci_t ncolsb = (col < R_right) ? blockrows_dst : (ncols - R_right);
      word const* RESTRICT fws = mzd_row(A, row) + col / m4ri_radix;
      word* RESTRICT fwd = mzd_row(DST, col) + row / m4ri_radix;

      // The following code is (almost) duplicated from _mzd_transpose.
      if (nrowsb >= 64) {

        int js = ncolsb & nrowsb & 64;  // True if the total number of whole 64x64 matrices is odd.
        wi_t const rowstride_64_dst = 64 * DST->rowstride;
        word* RESTRICT fwd_current = fwd;
        word const* RESTRICT fws_current = fws;
        if (js) {
          js = 1;
          _mzd_copy_transpose_64x64(fwd, fws, DST->rowstride, A->rowstride);
          fwd_current += rowstride_64_dst;
          ++fws_current;
        }
        rci_t const whole_64cols = ncolsb / 64;
        // The use of delayed and even is to avoid calling _mzd_copy_transpose_64x64_2 twice.
        // This way it can be inlined without duplicating the amount of code that has to be loaded.
        word* RESTRICT fwd_delayed = NULL;
        word const* RESTRICT fws_delayed = NULL;
        int even = 0;
        while (1)
        {
          for (int j = js; j < whole_64cols; ++j) {
            if (!even) {
              fwd_delayed = fwd_current;
              fws_delayed = fws_current;
            } else {
              _mzd_copy_transpose_64x64_2(fwd_delayed, fwd_current, fws_delayed, fws_current, DST->rowstride, A->rowstride);
            }
            fwd_current += rowstride_64_dst;
            ++fws_current;
            even = !even;
          }
          nrowsb -= 64;
          if (ncolsb % 64) {
            _mzd_copy_transpose_64xlt64(fwd + whole_64cols * rowstride_64_dst, fws + whole_64cols, DST->rowstride, A->rowstride, ncolsb % 64);
          }
          fwd += 1;
          fws += 64 * A->rowstride;
          if (nrowsb < 64)
            break;
          js = 0;
          fws_current = fws;
          fwd_current = fwd;
        }
      }

      if (nrowsb == 0)
        continue;

      // Transpose the remaining top rows. Now 0 < nrowsb < 64.

      while (ncolsb >= 64)
      {
        _mzd_copy_transpose_lt64x64(fwd, fws, DST->rowstride, A->rowstride, nrowsb);
        ncolsb -= 64;
        fwd += 64 * DST->rowstride;
        fws += 1;
      }

      // This holds because otherwise nrowsb would have been 0 and we would already have continued.
      assert(ncolsb == 0);
    }
  }

  *nrowsp = nrows - R_top;
  *ncolsp = ncols - R_right;
  if (R_top < nrows)
    *fwsp = mzd_row(A, R_top) + R_right / m4ri_radix;
  if (R_right < ncols)
    *fwdp = mzd_row(DST, R_right) + R_top / m4ri_radix;
}

mzd_t *mzd_transpose(mzd_t *DST, mzd_t const *A) {
  if (DST == NULL) {
    DST = mzd_init(A->ncols, A->nrows);
  } else if (__M4RI_UNLIKELY(DST->nrows != A->ncols || DST->ncols != A->nrows)) {
    m4ri_die("mzd_transpose: Wrong size for return matrix.\n");
  } else {
    /** it seems this is taken care of in the subroutines, re-enable if running into problems **/
    //mzd_set_ui(DST,0);
  }

  if(A->nrows == 0 || A->ncols == 0)
    return mzd_copy(DST, A);

  if (__M4RI_LIKELY(!mzd_is_windowed(DST) && !mzd_is_windowed(A)))
    return _mzd_transpose(DST, A);
  int A_windowed = mzd_is_windowed(A);
  if (A_windowed)
    A = mzd_copy(NULL, A);
  if (__M4RI_LIKELY(!mzd_is_windowed(DST)))
    _mzd_transpose(DST, A);
  else {
    mzd_t *D = mzd_init(DST->nrows, DST->ncols);
    _mzd_transpose(D, A);
    mzd_copy(DST, D);
    mzd_free(D);
  }
  if (A_windowed)
    mzd_free((mzd_t*)A);
  return DST;
}
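
/*
 * Illustrative sketch: DST may be NULL, in which case a fresh
 * A->ncols x A->nrows matrix is allocated and returned.
 *
 *   mzd_t *A  = mzd_init(100, 200);
 *   mzd_randomize(A);
 *   mzd_t *AT = mzd_transpose(NULL, A);   // 200 x 100
 *   mzd_t *B  = mzd_transpose(NULL, AT);  // transposing twice ...
 *   assert(mzd_equal(A, B));              // ... gives back A
 *   mzd_free(A); mzd_free(AT); mzd_free(B);
 */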

mzd_t *mzd_mul_naive(mzd_t *C, mzd_t const *A, mzd_t const *B) {
  if (C == NULL) {
    C = mzd_init(A->nrows, B->ncols);
  } else {
    if (C->nrows != A->nrows || C->ncols != B->ncols) {
      m4ri_die("mzd_mul_naive: Provided return matrix has wrong dimensions.\n");
    }
  }
  if(B->ncols < m4ri_radix - 10) {  /* this cutoff is rather arbitrary */
    mzd_t *BT = mzd_transpose(NULL, B);
    _mzd_mul_naive(C, A, BT, 1);
    mzd_free(BT);
  } else {
    _mzd_mul_va(C, A, B, 1);
  }
  return C;
}
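
/*
 * Illustrative sketch: naive cubic multiplication over GF(2). For large
 * matrices M4RI's Method-of-the-Four-Russians and Strassen routines
 * (implemented elsewhere in the library) are the better choice; this is the
 * reference implementation.
 *
 *   mzd_t *A = mzd_init(64, 128);
 *   mzd_t *B = mzd_init(128, 64);
 *   mzd_randomize(A);
 *   mzd_randomize(B);
 *   mzd_t *C = mzd_mul_naive(NULL, A, B);  // C = A * B, 64 x 64
 *   mzd_free(A); mzd_free(B); mzd_free(C);
 */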

mzd_t *mzd_addmul_naive(mzd_t *C, mzd_t const *A, mzd_t const *B) {
  if (C->nrows != A->nrows || C->ncols != B->ncols) {
    m4ri_die("mzd_addmul_naive: Provided return matrix has wrong dimensions.\n");
  }

  if(B->ncols < m4ri_radix - 10) {  /* this cutoff is rather arbitrary */
    mzd_t *BT = mzd_transpose(NULL, B);
    _mzd_mul_naive(C, A, BT, 0);
    mzd_free(BT);
  } else {
    _mzd_mul_va(C, A, B, 0);
  }
  return C;
}

mzd_t *_mzd_mul_naive(mzd_t *C, mzd_t const *A, mzd_t const *B, const int clear) {
  wi_t eol;
  word *a, *b, *c;

  if (clear) {
    word const mask_end = C->high_bitmask;
    /* improves performance on x86_64 but is not cross-platform */
    /* asm __volatile__ (".p2align 4\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop"); */
    for (rci_t i = 0; i < C->nrows; ++i) {
      wi_t j = 0;
      for (; j < C->width - 1; ++j) {
        C->rows[i][j] = 0;
      }
      C->rows[i][j] &= ~mask_end;
    }
  }

  if(C->ncols % m4ri_radix) {
    eol = (C->width - 1);
  } else {
    eol = (C->width);
  }

  word parity[64];
  for (int i = 0; i < 64; ++i) {
    parity[i] = 0;
  }
  wi_t const wide = A->width;
  int const blocksize = __M4RI_MUL_BLOCKSIZE;
  for (rci_t start = 0; start + blocksize <= C->nrows; start += blocksize) {
    for (rci_t i = start; i < start + blocksize; ++i) {
      a = A->rows[i];
      c = C->rows[i];
      for (rci_t j = 0; j < m4ri_radix * eol; j += m4ri_radix) {
        for (int k = 0; k < m4ri_radix; ++k) {
          b = B->rows[j + k];
          parity[k] = a[0] & b[0];
          for (wi_t ii = wide - 1; ii >= 1; --ii)
            parity[k] ^= a[ii] & b[ii];
        }
        c[j / m4ri_radix] ^= m4ri_parity64(parity);
      }

      if (eol != C->width) {
        word const mask_end = C->high_bitmask;
        /* improves performance on x86_64 but is not cross-platform */
        /* asm __volatile__ (".p2align 4\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop"); */
        for (int k = 0; k < (C->ncols % m4ri_radix); ++k) {
          b = B->rows[m4ri_radix * eol + k];
          parity[k] = a[0] & b[0];
          for (wi_t ii = 1; ii < A->width; ++ii)
            parity[k] ^= a[ii] & b[ii];
        }
        c[eol] ^= m4ri_parity64(parity) & mask_end;
      }
    }
  }

  for (rci_t i = C->nrows - (C->nrows % blocksize); i < C->nrows; ++i) {
    a = A->rows[i];
    c = C->rows[i];
    for (rci_t j = 0; j < m4ri_radix * eol; j += m4ri_radix) {
      for (int k = 0; k < m4ri_radix; ++k) {
        b = B->rows[j + k];
        parity[k] = a[0] & b[0];
        for (wi_t ii = wide - 1; ii >= 1; --ii)
          parity[k] ^= a[ii] & b[ii];
      }
      c[j / m4ri_radix] ^= m4ri_parity64(parity);
    }

    if (eol != C->width) {
      word const mask_end = C->high_bitmask;
      /* improves performance on x86_64 but is not cross-platform */
      /* asm __volatile__ (".p2align 4\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop"); */
      for (int k = 0; k < (C->ncols % m4ri_radix); ++k) {
        b = B->rows[m4ri_radix * eol + k];
        parity[k] = a[0] & b[0];
        for (wi_t ii = 1; ii < A->width; ++ii)
          parity[k] ^= a[ii] & b[ii];
      }
      c[eol] ^= m4ri_parity64(parity) & mask_end;
    }
  }

  __M4RI_DD_MZD(C);
  return C;
}

mzd_t *_mzd_mul_va(mzd_t *C, mzd_t const *v, mzd_t const *A, int const clear) {
  if(clear)
    mzd_set_ui(C, 0);

  rci_t const m = v->nrows;
  rci_t const n = v->ncols;

  for(rci_t i = 0; i < m; ++i)
    for(rci_t j = 0; j < n; ++j)
      if (mzd_read_bit(v, i, j))
        mzd_combine(C, i, 0, C, i, 0, A, j, 0);

  __M4RI_DD_MZD(C);
  return C;
}

void mzd_randomize(mzd_t *A) {
  wi_t const width = A->width - 1;
  word const mask_end = A->high_bitmask;
  for(rci_t i = 0; i < A->nrows; ++i) {
    for(wi_t j = 0; j < width; ++j)
      A->rows[i][j] = m4ri_random_word();
    A->rows[i][width] ^= (A->rows[i][width] ^ m4ri_random_word()) & mask_end;
  }

  __M4RI_DD_MZD(A);
}

void mzd_set_ui(mzd_t *A, unsigned int value) {
  word const mask_end = A->high_bitmask;

  for (rci_t i = 0; i < A->nrows; ++i) {
    word *row = A->rows[i];
    for(wi_t j = 0; j < A->width - 1; ++j)
      row[j] = 0;
    row[A->width - 1] &= ~mask_end;
  }

  if(value % 2 == 0) {
    __M4RI_DD_MZD(A);
    return;
  }

  rci_t const stop = MIN(A->nrows, A->ncols);
  for (rci_t i = 0; i < stop; ++i) {
    mzd_write_bit(A, i, i, 1);
  }

  __M4RI_DD_MZD(A);
}
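
/*
 * Illustrative sketch: only value % 2 matters over GF(2), so mzd_set_ui(A, 0)
 * clears A while mzd_set_ui(A, 1) writes the identity (ones on the main
 * diagonal, zeroes elsewhere), also for non-square A.
 *
 *   mzd_t *A = mzd_init(4, 6);
 *   mzd_set_ui(A, 1);  // 4 x 6 "identity": bits (0,0), (1,1), (2,2), (3,3)
 *   mzd_set_ui(A, 2);  // even value: same as mzd_set_ui(A, 0), A is zero
 *   mzd_free(A);
 */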

int mzd_equal(mzd_t const *A, mzd_t const *B) {
  if (A->nrows != B->nrows) return FALSE;
  if (A->ncols != B->ncols) return FALSE;
  if (A == B) return TRUE;

  wi_t Awidth = A->width - 1;

  for (rci_t i = 0; i < A->nrows; ++i) {
    for (wi_t j = 0; j < Awidth; ++j) {
      if (A->rows[i][j] != B->rows[i][j])
        return FALSE;
    }
  }

  word const mask_end = A->high_bitmask;
  for (rci_t i = 0; i < A->nrows; ++i) {
    if (((A->rows[i][Awidth] ^ B->rows[i][Awidth]) & mask_end))
      return FALSE;
  }
  return TRUE;
}

int mzd_cmp(mzd_t const *A, mzd_t const *B) {
  if(A->nrows < B->nrows) return -1;
  if(B->nrows < A->nrows) return 1;
  if(A->ncols < B->ncols) return -1;
  if(B->ncols < A->ncols) return 1;

  const word mask_end = A->high_bitmask;
  const wi_t n = A->width - 1;

  /* Columns with large index are "larger", but rows with small index
     are more important than those with large index. */

  for(rci_t i = 0; i < A->nrows; i++) {
    if ((A->rows[i][n] & mask_end) < (B->rows[i][n] & mask_end))
      return -1;
    else if ((A->rows[i][n] & mask_end) > (B->rows[i][n] & mask_end))
      return 1;

    for(wi_t j = n - 1; j >= 0; j--) {
      if (A->rows[i][j] < B->rows[i][j])
        return -1;
      else if (A->rows[i][j] > B->rows[i][j])
        return 1;
    }
  }
  return 0;
}

mzd_t *mzd_copy(mzd_t *N, mzd_t const *P) {
  if (N == P)
    return N;

  if (N == NULL) {
    N = mzd_init(P->nrows, P->ncols);
  } else {
    if (N->nrows < P->nrows || N->ncols < P->ncols)
      m4ri_die("mzd_copy: Target matrix is too small.");
  }
  word *p_truerow, *n_truerow;
  wi_t const wide = P->width - 1;
  word mask_end = P->high_bitmask;
  for (rci_t i = 0; i < P->nrows; ++i) {
    p_truerow = P->rows[i];
    n_truerow = N->rows[i];
    for (wi_t j = 0; j < wide; ++j)
      n_truerow[j] = p_truerow[j];
    n_truerow[wide] = (n_truerow[wide] & ~mask_end) | (p_truerow[wide] & mask_end);
  }
  __M4RI_DD_MZD(N);
  return N;
}

/* This is sometimes called augment. */
mzd_t *mzd_concat(mzd_t *C, mzd_t const *A, mzd_t const *B) {
  if (A->nrows != B->nrows) {
    m4ri_die("mzd_concat: Bad arguments to concat!\n");
  }

  if (C == NULL) {
    C = mzd_init(A->nrows, A->ncols + B->ncols);
  } else if (C->nrows != A->nrows || C->ncols != (A->ncols + B->ncols)) {
    m4ri_die("mzd_concat: C has wrong dimension!\n");
  }

  for (rci_t i = 0; i < A->nrows; ++i) {
    word *dst_truerow = C->rows[i];
    word *src_truerow = A->rows[i];
    for (wi_t j = 0; j < A->width; ++j) {
      dst_truerow[j] = src_truerow[j];
    }
  }

  for (rci_t i = 0; i < B->nrows; ++i) {
    for (rci_t j = 0; j < B->ncols; ++j) {
      mzd_write_bit(C, i, j + A->ncols, mzd_read_bit(B, i, j));
    }
  }

  __M4RI_DD_MZD(C);
  return C;
}

mzd_t *mzd_stack(mzd_t *C, mzd_t const *A, mzd_t const *B) {
  if (A->ncols != B->ncols) {
    m4ri_die("mzd_stack: A->ncols (%d) != B->ncols (%d)!\n", A->ncols, B->ncols);
  }

  if (C == NULL) {
    C = mzd_init(A->nrows + B->nrows, A->ncols);
  } else if (C->nrows != (A->nrows + B->nrows) || C->ncols != A->ncols) {
    m4ri_die("mzd_stack: C has wrong dimension!\n");
  }

  for(rci_t i = 0; i < A->nrows; ++i) {
    word *src_truerow = A->rows[i];
    word *dst_truerow = C->rows[i];
    for (wi_t j = 0; j < A->width; ++j) {
      dst_truerow[j] = src_truerow[j];
    }
  }

  for(rci_t i = 0; i < B->nrows; ++i) {
    word *dst_truerow = C->rows[A->nrows + i];
    word *src_truerow = B->rows[i];
    for (wi_t j = 0; j < B->width; ++j) {
      dst_truerow[j] = src_truerow[j];
    }
  }

  __M4RI_DD_MZD(C);
  return C;
}
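
/*
 * Illustrative sketch: mzd_concat places B to the right of A ([A | B]),
 * mzd_stack places B below A.
 *
 *   mzd_t *A  = mzd_init(4, 4);
 *   mzd_t *B  = mzd_init(4, 4);
 *   mzd_set_ui(A, 1);
 *   mzd_t *AB = mzd_concat(NULL, A, B);  // 4 x 8: [A | B]
 *   mzd_t *S  = mzd_stack(NULL, A, B);   // 8 x 4: A on top of B
 *   mzd_free(A); mzd_free(B); mzd_free(AB); mzd_free(S);
 */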

mzd_t *mzd_invert_naive(mzd_t *INV, mzd_t const *A, mzd_t const *I) {
  mzd_t *H;

  H = mzd_concat(NULL, A, I);

  rci_t x = mzd_echelonize_naive(H, TRUE);

  if (x == 0) {
    mzd_free(H);
    return NULL;
  }

  INV = mzd_submatrix(INV, H, 0, A->ncols, A->nrows, 2 * A->ncols);

  mzd_free(H);

  __M4RI_DD_MZD(INV);
  return INV;
}
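
/*
 * Illustrative sketch: Gauss-Jordan inversion via [A | I] -> [I | A^-1].
 * I must be an identity matrix of the same dimension as A.
 *
 *   mzd_t *A = mzd_init(64, 64);
 *   mzd_randomize(A);
 *   mzd_t *I = mzd_init(64, 64);
 *   mzd_set_ui(I, 1);
 *   mzd_t *INV = mzd_invert_naive(NULL, A, I);  // NULL when no pivots were found
 *   if (INV) {
 *     mzd_t *C = mzd_mul_naive(NULL, A, INV);   // C == I when A is invertible
 *     mzd_free(C);
 *     mzd_free(INV);
 *   }
 *   mzd_free(A);
 *   mzd_free(I);
 */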

mzd_t *mzd_add(mzd_t *ret, mzd_t const *left, mzd_t const *right) {
  if (left->nrows != right->nrows || left->ncols != right->ncols) {
    m4ri_die("mzd_add: rows and columns must match.\n");
  }
  if (ret == NULL) {
    ret = mzd_init(left->nrows, left->ncols);
  } else if (ret != left) {
    if (ret->nrows != left->nrows || ret->ncols != left->ncols) {
      m4ri_die("mzd_add: rows and columns of returned matrix must match.\n");
    }
  }
  return _mzd_add(ret, left, right);
}
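
/*
 * Illustrative sketch: over GF(2) addition is XOR, so every matrix is its
 * own additive inverse.
 *
 *   mzd_t *A = mzd_init(32, 32);
 *   mzd_randomize(A);
 *   mzd_t *Z = mzd_add(NULL, A, A);  // A + A == 0
 *   // mzd_is_zero(Z) would hold here.
 *   mzd_free(A); mzd_free(Z);
 */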

mzd_t *_mzd_add(mzd_t *C, mzd_t const *A, mzd_t const *B) {
  rci_t const nrows = MIN(MIN(A->nrows, B->nrows), C->nrows);

  if (C == B) {  // swap
    mzd_t const *tmp = A;
    A = B;
    B = tmp;
  }

  word const mask_end = C->high_bitmask;

  switch(A->width) {
  case 0:
    return C;
  case 1:
    for(rci_t i = 0; i < nrows; ++i) {
      C->rows[i][0] ^= ((A->rows[i][0] ^ B->rows[i][0] ^ C->rows[i][0]) & mask_end);
    }
    break;
  case 2:
    for(rci_t i = 0; i < nrows; ++i) {
      C->rows[i][0] = A->rows[i][0] ^ B->rows[i][0];
      C->rows[i][1] ^= ((A->rows[i][1] ^ B->rows[i][1] ^ C->rows[i][1]) & mask_end);
    }
    break;
  case 3:
    for(rci_t i = 0; i < nrows; ++i) {
      C->rows[i][0] = A->rows[i][0] ^ B->rows[i][0];
      C->rows[i][1] = A->rows[i][1] ^ B->rows[i][1];
      C->rows[i][2] ^= ((A->rows[i][2] ^ B->rows[i][2] ^ C->rows[i][2]) & mask_end);
    }
    break;
  case 4:
    for(rci_t i = 0; i < nrows; ++i) {
      C->rows[i][0] = A->rows[i][0] ^ B->rows[i][0];
      C->rows[i][1] = A->rows[i][1] ^ B->rows[i][1];
      C->rows[i][2] = A->rows[i][2] ^ B->rows[i][2];
      C->rows[i][3] ^= ((A->rows[i][3] ^ B->rows[i][3] ^ C->rows[i][3]) & mask_end);
    }
    break;
  case 5:
    for(rci_t i = 0; i < nrows; ++i) {
      C->rows[i][0] = A->rows[i][0] ^ B->rows[i][0];
      C->rows[i][1] = A->rows[i][1] ^ B->rows[i][1];
      C->rows[i][2] = A->rows[i][2] ^ B->rows[i][2];
      C->rows[i][3] = A->rows[i][3] ^ B->rows[i][3];
      C->rows[i][4] ^= ((A->rows[i][4] ^ B->rows[i][4] ^ C->rows[i][4]) & mask_end);
    }
    break;
  case 6:
    for(rci_t i = 0; i < nrows; ++i) {
      C->rows[i][0] = A->rows[i][0] ^ B->rows[i][0];
      C->rows[i][1] = A->rows[i][1] ^ B->rows[i][1];
      C->rows[i][2] = A->rows[i][2] ^ B->rows[i][2];
      C->rows[i][3] = A->rows[i][3] ^ B->rows[i][3];
      C->rows[i][4] = A->rows[i][4] ^ B->rows[i][4];
      C->rows[i][5] ^= ((A->rows[i][5] ^ B->rows[i][5] ^ C->rows[i][5]) & mask_end);
    }
    break;
  case 7:
    for(rci_t i = 0; i < nrows; ++i) {
      C->rows[i][0] = A->rows[i][0] ^ B->rows[i][0];
      C->rows[i][1] = A->rows[i][1] ^ B->rows[i][1];
      C->rows[i][2] = A->rows[i][2] ^ B->rows[i][2];
      C->rows[i][3] = A->rows[i][3] ^ B->rows[i][3];
      C->rows[i][4] = A->rows[i][4] ^ B->rows[i][4];
      C->rows[i][5] = A->rows[i][5] ^ B->rows[i][5];
      C->rows[i][6] ^= ((A->rows[i][6] ^ B->rows[i][6] ^ C->rows[i][6]) & mask_end);
    }
    break;
  case 8:
    for(rci_t i = 0; i < nrows; ++i) {
      C->rows[i][0] = A->rows[i][0] ^ B->rows[i][0];
      C->rows[i][1] = A->rows[i][1] ^ B->rows[i][1];
      C->rows[i][2] = A->rows[i][2] ^ B->rows[i][2];
      C->rows[i][3] = A->rows[i][3] ^ B->rows[i][3];
      C->rows[i][4] = A->rows[i][4] ^ B->rows[i][4];
      C->rows[i][5] = A->rows[i][5] ^ B->rows[i][5];
      C->rows[i][6] = A->rows[i][6] ^ B->rows[i][6];
      C->rows[i][7] ^= ((A->rows[i][7] ^ B->rows[i][7] ^ C->rows[i][7]) & mask_end);
    }
    break;

  default:
    for(rci_t i = 0; i < nrows; ++i) {
      mzd_combine_even(C, i, 0, A, i, 0, B, i, 0);
    }
  }

  __M4RI_DD_MZD(C);
  return C;
}
1850
1851
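/*
 * mzd_submatrix() copies the window [startrow, endrow) x [startcol,
 * endcol) of M into S.  A word-aligned startcol allows a memcpy per row
 * plus one masked word; otherwise every destination word is reassembled
 * with mzd_read_bits().  A minimal usage sketch (the bounds are made up
 * for the example):
 *
 *   mzd_t *S = mzd_submatrix(NULL, A, 0, 0, 10, 10);  // top-left 10x10
 */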
mzd_t *mzd_submatrix(mzd_t *S, mzd_t const *M, rci_t const startrow, rci_t const startcol, rci_t const endrow, rci_t const endcol) {
  rci_t const nrows = endrow - startrow;
  rci_t const ncols = endcol - startcol;

  if (S == NULL) {
    S = mzd_init(nrows, ncols);
  } else if( (S->nrows < nrows) | (S->ncols < ncols) ) {
    m4ri_die("mzd_submatrix: got S with dimension %d x %d but expected %d x %d\n", S->nrows, S->ncols, nrows, ncols);
  }

  if (startcol % m4ri_radix == 0) {

    wi_t const startword = startcol / m4ri_radix;
    /* we start at the beginning of a word */
    if(ncols / m4ri_radix != 0) {
      for(rci_t x = startrow, i = 0; i < nrows; ++i, ++x) {
        memcpy(S->rows[i], M->rows[x] + startword, sizeof(word) * (ncols / m4ri_radix));
      }
    }
    if (ncols % m4ri_radix) {
      word const mask_end = __M4RI_LEFT_BITMASK(ncols % m4ri_radix);
      for(rci_t x = startrow, i = 0; i < nrows; ++i, ++x) {
        /* process remaining bits */
        word temp = M->rows[x][startword + ncols / m4ri_radix] & mask_end;
        S->rows[i][ncols / m4ri_radix] = temp;
      }
    }
  } else {
    wi_t j;
    for(rci_t i=0; i<nrows; i++) {
      /* strict comparison: the last (possibly full) word is written by the
         masked tail below, so j/m4ri_radix never runs past S->width - 1 and
         mzd_read_bits is never asked for zero bits */
      for(j=0; j+m4ri_radix<ncols; j+=m4ri_radix)
        S->rows[i][j/m4ri_radix] = mzd_read_bits(M, startrow+i, startcol+j, m4ri_radix);
      S->rows[i][j/m4ri_radix] &= ~S->high_bitmask;
      S->rows[i][j/m4ri_radix] |= mzd_read_bits(M, startrow+i, startcol+j, ncols - j) & S->high_bitmask;
    }
  }
  __M4RI_DD_MZD(S);
  return S;
}

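/*
 * mzd_col_swap() exchanges two columns with the classic XOR bit-field
 * swap.  For two bits sitting `offset` apart in the same word w:
 *
 *   word x = w ^ (w >> offset);   // difference of the two bits, at min_bit
 *   x &= mask;                    // keep only that difference bit
 *   w ^= x | (x << offset);       // flip both bits iff they differ
 *
 * which is the form the unrolled xor[] loop below applies to four rows
 * per iteration; the else branch handles columns that live in different
 * words of a row.
 */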
void mzd_col_swap(mzd_t *M, rci_t const cola, rci_t const colb) {
  if (cola == colb)
    return;

  rci_t const _cola = cola;
  rci_t const _colb = colb;

  wi_t const a_word = _cola / m4ri_radix;
  wi_t const b_word = _colb / m4ri_radix;

  int const a_bit = _cola % m4ri_radix;
  int const b_bit = _colb % m4ri_radix;

  word* RESTRICT ptr = mzd_first_row(M);
  int max_bit = MAX(a_bit, b_bit);
  int count = mzd_rows_in_block(M, 0);
  assert(count > 0);
  int min_bit = a_bit + b_bit - max_bit;
  int block = 0;
  int offset = max_bit - min_bit;
  word mask = m4ri_one << min_bit;

  if (a_word == b_word) {
    while(1) {
      ptr += a_word;
      int fast_count = count / 4;
      int rest_count = count - 4 * fast_count;
      word xor[4];
      wi_t const rowstride = M->rowstride;
      while (fast_count--) {
        xor[0] = ptr[0];
        xor[1] = ptr[rowstride];
        xor[2] = ptr[2 * rowstride];
        xor[3] = ptr[3 * rowstride];
        xor[0] ^= xor[0] >> offset;
        xor[1] ^= xor[1] >> offset;
        xor[2] ^= xor[2] >> offset;
        xor[3] ^= xor[3] >> offset;
        xor[0] &= mask;
        xor[1] &= mask;
        xor[2] &= mask;
        xor[3] &= mask;
        xor[0] |= xor[0] << offset;
        xor[1] |= xor[1] << offset;
        xor[2] |= xor[2] << offset;
        xor[3] |= xor[3] << offset;
        ptr[0] ^= xor[0];
        ptr[rowstride] ^= xor[1];
        ptr[2 * rowstride] ^= xor[2];
        ptr[3 * rowstride] ^= xor[3];
        ptr += 4 * rowstride;
      }
      while (rest_count--) {
        word xor = *ptr;
        xor ^= xor >> offset;
        xor &= mask;
        *ptr ^= xor | (xor << offset);
        ptr += rowstride;
      }
      if ((count = mzd_rows_in_block(M, ++block)) <= 0)
        break;
      ptr = mzd_first_row_next_block(M, block);
    }
  } else {
    word* RESTRICT min_ptr;
    wi_t max_offset;
    if (min_bit == a_bit) {
      min_ptr = ptr + a_word;
      max_offset = b_word - a_word;
    } else {
      min_ptr = ptr + b_word;
      max_offset = a_word - b_word;
    }
    while(1) {
      wi_t const rowstride = M->rowstride;
      while(count--) {
        word xor = (min_ptr[0] ^ (min_ptr[max_offset] >> offset)) & mask;
        min_ptr[0] ^= xor;
        min_ptr[max_offset] ^= xor << offset;
        min_ptr += rowstride;
      }
      if ((count = mzd_rows_in_block(M, ++block)) <= 0)
        break;
      ptr = mzd_first_row_next_block(M, block);
      if (min_bit == a_bit)
        min_ptr = ptr + a_word;
      else
        min_ptr = ptr + b_word;
    }
  }

  __M4RI_DD_MZD(M);
}

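/*
 * mzd_is_zero() ORs the words of each row together, masking the last word
 * so that padding bits past ncols cannot make a zero matrix look nonzero,
 * and stops at the first nonzero row.  Typical use:
 *
 *   if (mzd_is_zero(A))
 *     ;   // every entry of A is zero
 */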
int mzd_is_zero(mzd_t const *A) {
  word status = 0;
  word mask_end = A->high_bitmask;
  for (rci_t i = 0; i < A->nrows; ++i) {
    for (wi_t j = 0; j < A->width - 1; ++j)
      status |= A->rows[i][j];
    status |= A->rows[i][A->width - 1] & mask_end;
    if(status)
      return 0;
  }
  return !status;
}

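/*
 * mzd_copy_row() copies row j of A into row i of B, where B may be wider
 * than A.  The last word is merged through mask_end,
 *
 *   b[width] = (b[width] & ~mask_end) | (a[width] & mask_end);
 *
 * so any bits of B beyond A->ncols survive the copy unchanged.
 */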
void mzd_copy_row(mzd_t *B, rci_t i, mzd_t const *A, rci_t j) {
  assert(B->ncols >= A->ncols);
  wi_t const width = MIN(B->width, A->width) - 1;

  word const *a = A->rows[j];
  word *b = B->rows[i];

  word const mask_end = __M4RI_LEFT_BITMASK(A->ncols % m4ri_radix);

  if (width != 0) {
    for(wi_t k = 0; k < width; ++k)
      b[k] = a[k];
    b[width] = (b[width] & ~mask_end) | (a[width] & mask_end);
  } else {
    /* single-word row: overwrite the masked bits instead of OR-ing them,
       so that stale one-bits in b cannot survive the copy */
    b[0] = (a[0] & mask_end) | (b[0] & ~mask_end);
  }

  __M4RI_DD_ROW(B, i);
}

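/*
 * mzd_find_pivot() searches the region at or below start_row and at or
 * right of start_col for a nonzero entry with the smallest column index,
 * preferring the earliest row among candidates, and reports it through
 * *r and *c.  m4ri_lesser_LSB(a, b) compares whole words by the position
 * of their least significant set bits, so no bit index is extracted until
 * a winning word is known.  A minimal usage sketch:
 *
 *   rci_t r, c;
 *   if (mzd_find_pivot(A, 0, 0, &r, &c))
 *     ;   // A[r,c] == 1 is the leftmost pivot candidate
 *   else
 *     ;   // the region is identically zero
 */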
int mzd_find_pivot(mzd_t const *A, rci_t start_row, rci_t start_col, rci_t *r, rci_t *c) {
  rci_t const nrows = A->nrows;
  rci_t const ncols = A->ncols;
  word data = 0;
  rci_t row_candidate = 0;
  if(A->ncols - start_col < m4ri_radix) {
    for(rci_t j = start_col; j < A->ncols; j += m4ri_radix) {
      int const length = MIN(m4ri_radix, ncols - j);
      for(rci_t i = start_row; i < nrows; ++i) {
        word const curr_data = mzd_read_bits(A, i, j, length);
        if (m4ri_lesser_LSB(curr_data, data)) {
          row_candidate = i;
          data = curr_data;
        }
      }
      if(data) {
        *r = row_candidate;
        for(int l = 0; l < length; ++l) {
          if(__M4RI_GET_BIT(data, l)) {
            *c = j + l;
            break;
          }
        }
        __M4RI_DD_RCI(*r);
        __M4RI_DD_RCI(*c);
        __M4RI_DD_INT(1);
        return 1;
      }
    }
  } else {
    /* we definitely have more than one word */
    /* handle first word */
    int const bit_offset = (start_col % m4ri_radix);
    wi_t const word_offset = start_col / m4ri_radix;
    word const mask_begin = __M4RI_RIGHT_BITMASK(m4ri_radix-bit_offset);
    for(rci_t i = start_row; i < nrows; ++i) {
      word const curr_data = A->rows[i][word_offset] & mask_begin;
      if (m4ri_lesser_LSB(curr_data, data)) {
        row_candidate = i;
        data = curr_data;
        if(__M4RI_GET_BIT(data, bit_offset)) {
          break;
        }
      }
    }
    if(data) {
      *r = row_candidate;
      data >>= bit_offset;
      assert(data);
      for(int l = 0; l < (m4ri_radix - bit_offset); ++l) {
        if(__M4RI_GET_BIT(data, l)) {
          *c = start_col + l;
          break;
        }
      }
      __M4RI_DD_RCI(*r);
      __M4RI_DD_RCI(*c);
      __M4RI_DD_INT(1);
      return 1;
    }
    /* handle complete words */
    for(wi_t wi = word_offset + 1; wi < A->width - 1; ++wi) {
      for(rci_t i = start_row; i < nrows; ++i) {
        word const curr_data = A->rows[i][wi];
        if (m4ri_lesser_LSB(curr_data, data)) {
          row_candidate = i;
          data = curr_data;
          if(__M4RI_GET_BIT(data, 0))
            break;
        }
      }
      if(data) {
        *r = row_candidate;
        for(int l = 0; l < m4ri_radix; ++l) {
          if(__M4RI_GET_BIT(data, l)) {
            *c = wi * m4ri_radix + l;
            break;
          }
        }
        __M4RI_DD_RCI(*r);
        __M4RI_DD_RCI(*c);
        __M4RI_DD_INT(1);
        return 1;
      }
    }
    /* handle last word */
    int const end_offset = (A->ncols % m4ri_radix) ? (A->ncols % m4ri_radix) : m4ri_radix;
    word const mask_end = __M4RI_LEFT_BITMASK(end_offset % m4ri_radix);
    wi_t wi = A->width - 1;
    for(rci_t i = start_row; i < nrows; ++i) {
      word const curr_data = A->rows[i][wi] & mask_end;
      if (m4ri_lesser_LSB(curr_data, data)) {
        row_candidate = i;
        data = curr_data;
        if(__M4RI_GET_BIT(data, 0))
          break;
      }
    }
    if(data) {
      *r = row_candidate;
      for(int l = 0; l < end_offset; ++l) {
        if(__M4RI_GET_BIT(data, l)) {
          *c = wi * m4ri_radix + l;
          break;
        }
      }
      __M4RI_DD_RCI(*r);
      __M4RI_DD_RCI(*c);
      __M4RI_DD_INT(1);
      return 1;
    }
  }
  __M4RI_DD_RCI(*r);
  __M4RI_DD_RCI(*c);
  __M4RI_DD_INT(0);
  return 0;
}

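/*
 * m4ri_bitcount() below is a branch-free SWAR population count:
 * COUNT(x, c) adds adjacent bit fields of width 2^c, so six rounds fold
 * 64 one-bit counters into one 64-bit sum.  MASK(c), i.e.
 * (2^64 - 1) / (2^(2^c) + 1), expands to the alternating pattern each
 * round needs:
 *
 *   MASK(0) == 0x5555555555555555   // 01 repeated
 *   MASK(1) == 0x3333333333333333   // 0011 repeated
 *   MASK(2) == 0x0f0f0f0f0f0f0f0f   // 00001111 repeated
 */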
#define MASK(c) (((uint64_t)(-1)) / (__M4RI_TWOPOW(__M4RI_TWOPOW(c)) + 1))
#define COUNT(x,c) ((x) & MASK(c)) + (((x) >> (__M4RI_TWOPOW(c))) & MASK(c))

static inline int m4ri_bitcount(word w) {
  uint64_t n = __M4RI_CONVERT_TO_UINT64_T(w);
  n = COUNT(n, 0);
  n = COUNT(n, 1);
  n = COUNT(n, 2);
  n = COUNT(n, 3);
  n = COUNT(n, 4);
  n = COUNT(n, 5);
  return (int)n;
}

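/*
 * _mzd_density() estimates the fraction of set bits in A, restricted to
 * rows >= r and columns >= c.  For wide matrices only every res-th word
 * of a row is popcounted, so the result is a sample rather than an exact
 * count; res = 1 scans every word.  Typical use through the public
 * wrapper below:
 *
 *   double d = mzd_density(A, 1);   // full scan of all of A
 */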
double _mzd_density(mzd_t const *A, wi_t res, rci_t r, rci_t c) {
  size_t count = 0;
  size_t total = 0;

  if(A->width == 1) {
    for(rci_t i = r; i < A->nrows; ++i)
      for(rci_t j = c; j < A->ncols; ++j)
        if(mzd_read_bit(A, i, j))
          ++count;
    return ((double)count)/(1.0 * A->ncols * A->nrows);
  }

  if(res == 0)
    res = A->width / 100;
  if (res < 1)
    res = 1;

  for(rci_t i = r; i < A->nrows; ++i) {
    word *truerow = A->rows[i];
    for(rci_t j = c; j < m4ri_radix; ++j)
      if(mzd_read_bit(A, i, j))
        ++count;
    total += m4ri_radix;

    for(wi_t j = MAX(1, c / m4ri_radix); j < A->width - 1; j += res) {
      count += m4ri_bitcount(truerow[j]);
      total += m4ri_radix;
    }
    for(int j = 0; j < A->ncols % m4ri_radix; ++j)
      if(mzd_read_bit(A, i, m4ri_radix * (A->ncols / m4ri_radix) + j))
        ++count;
    total += A->ncols % m4ri_radix;
  }

  return (double)count / total;
}

double mzd_density(mzd_t const *A, wi_t res) {
  return _mzd_density(A, res, 0, 0);
}

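/*
 * mzd_first_zero_row() scans A from the bottom up and returns the index
 * of the first row of the trailing block of all-zero rows: A->nrows if
 * the last row is nonzero, 0 if the whole matrix is zero.  For a matrix
 * already in echelon form this is exactly the rank, e.g.:
 *
 *   mzd_echelonize_naive(A, TRUE);
 *   rci_t rank = mzd_first_zero_row(A);
 */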
rci_t mzd_first_zero_row(mzd_t const *A) {
  word const mask_end = __M4RI_LEFT_BITMASK(A->ncols % m4ri_radix);
  wi_t const end = A->width - 1;
  word *row;

  for(rci_t i = A->nrows - 1; i >= 0; --i) {
    row = A->rows[i];
    /* mask the last word first so that padding bits are ignored even when
       the row consists of a single word */
    word tmp = row[end] & mask_end;
    for (wi_t j = 0; j < end; ++j)
      tmp |= row[j];
    if(tmp) {
      __M4RI_DD_INT(i + 1);
      return i + 1;
    }
  }
  __M4RI_DD_INT(0);
  return 0;
}

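/*
 * mzd_extract_u() and mzd_extract_l() (below) return the upper- and
 * lower-triangular parts of the leading k x k block of A, where
 * k = min(nrows, ncols), by zeroing everything on the other side of the
 * diagonal.  Full words are cleared directly and the word straddling the
 * diagonal is trimmed with mzd_clear_bits().  Sketch, e.g. after an
 * LU-style factorization computed elsewhere:
 *
 *   mzd_t *U = mzd_extract_u(NULL, A);   // copy with strictly lower part zeroed
 *   mzd_t *L = mzd_extract_l(NULL, A);   // copy with strictly upper part zeroed
 */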
mzd_t *mzd_extract_u(mzd_t *U, mzd_t const *A) {
  rci_t k = MIN(A->nrows, A->ncols);
  if (U == NULL)
    U = mzd_submatrix(NULL, A, 0, 0, k, k);
  else
    assert(U->nrows == k && U->ncols == k);
  for(rci_t i=1; i<U->nrows; i++) {
    for(wi_t j=0; j<i/m4ri_radix; j++) {
      U->rows[i][j] = 0;
    }
    if(i%m4ri_radix)
      mzd_clear_bits(U, i, (i/m4ri_radix)*m4ri_radix, i%m4ri_radix);
  }
  return U;
}

mzd_t *mzd_extract_l(mzd_t *L, mzd_t const *A) {
  rci_t k = MIN(A->nrows, A->ncols);
  if (L == NULL)
    L = mzd_submatrix(NULL, A, 0, 0, k, k);
  else
    assert(L->nrows == k && L->ncols == k);
  for(rci_t i=0; i<L->nrows-1; i++) {
    if(m4ri_radix - (i+1)%m4ri_radix)
      mzd_clear_bits(L, i, i+1, m4ri_radix - (i+1)%m4ri_radix);
    for(wi_t j=(i/m4ri_radix+1); j<L->width; j++) {
      L->rows[i][j] = 0;
    }
  }
  return L;
}