linear_system_banded_avx.c 10.1 KB
Newer Older
Jonathan Lambrechts's avatar
Jonathan Lambrechts committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
#include <math.h>
#include "mesh.h"
#include "linear_system.h"
#include <stdlib.h>
#include <string.h>

#define CONMAX 12
void connectivity_insert(int *connectivity, int i, int j)
{
  for (int k = 0; k < CONMAX; ++k) {
    int *p = connectivity + CONMAX*i + k;
    if (*p == -1)
      *p = j;
    if (*p == j)
      return;
  }
  printf("ERROR : node %i has more than %i neighbours\n", i, CONMAX);
}

Jonathan Lambrechts's avatar
tmp    
Jonathan Lambrechts committed
20
void reverse_cuthill_mckee(Mesh *mesh, int *ordering)
Jonathan Lambrechts's avatar
Jonathan Lambrechts committed
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
{
  int *node_connectivity = malloc(sizeof(int)*mesh->n_nodes*CONMAX);
  for (int i = 0; i < mesh->n_nodes*CONMAX; ++i) {
    node_connectivity[i] = -1;
  }
  for (int i = 0; i < mesh->n_triangles; ++i) {
    int *tri = mesh->triangles + i*3;
    connectivity_insert(node_connectivity, tri[0], tri[1]);
    connectivity_insert(node_connectivity, tri[1], tri[0]);
    connectivity_insert(node_connectivity, tri[0], tri[2]);
    connectivity_insert(node_connectivity, tri[2], tri[0]);
    connectivity_insert(node_connectivity, tri[1], tri[2]);
    connectivity_insert(node_connectivity, tri[2], tri[1]);
  }
  int *node_degree = malloc(sizeof(int)*mesh->n_nodes);
  for (int i = 0; i < mesh->n_nodes; ++i) {
    node_degree[i] = 0;
    for (int j = 0; j < CONMAX; ++j) {
      if (node_connectivity[CONMAX*i+j] == -1)
        break;
      node_degree[i] += 1;
    }
  }
  int *queue = malloc(sizeof(int)*mesh->n_nodes);
  queue[0] = 0;
  for (int i = 0; i < mesh->n_nodes; ++i){
    ordering[i] = -1;
48
    if (node_degree[queue[0]] > node_degree[i] && node_degree[i] !=0)
Jonathan Lambrechts's avatar
Jonathan Lambrechts committed
49
50
51
52
53
54
55
56
57
      queue[0] = i;
  }
  int stage_start = 0;
  int stage_end = 1;
  int queue_end = 1;
  int id = 0;
  while(stage_start != stage_end) {
    for (int i = stage_start; i < stage_end; ++i) {
      int c = queue[i];
58
      ordering[c] = id++;
Jonathan Lambrechts's avatar
Jonathan Lambrechts committed
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
      for(int j = 0; j < node_degree[c]; ++j) {
        int o = node_connectivity[c*CONMAX+j];
        if (ordering[o] == -1) {
          ordering[o] = -2;
#if 1
          queue[queue_end++] = o;
#else
          int k = stage_end;
          while(k < queue_end && node_degree[queue[k]] < node_degree[o])
            k++;
          for (int l = queue_end; l > k; --l)
            queue[l] = queue[l-1];
          queue[k] = o;
          queue_end++;
#endif
        }
      }
    }
    stage_start = stage_end;
    stage_end = queue_end;
  }
80
81
82
83
  for (int i = 0; i < mesh->n_nodes; ++i)
    if(ordering[i] >= 0)
      ordering[i] = id-1-ordering[i];
  printf("id = %i\n", id);
Jonathan Lambrechts's avatar
Jonathan Lambrechts committed
84
85
86
87
88
  free(queue);
  free(node_degree);
  free(node_connectivity);
}

89
90
91
92
93
94
95
96
#if defined(__AVX2__)
#define PADDING 4
#elif defined(__AVX__)
#define PADDING 2
#else
#define PADDING 1
#endif

Jonathan Lambrechts's avatar
Jonathan Lambrechts committed
97
98
99
100
101
102
#include <immintrin.h>
// y[from:to] += a*x[from:to]
// y and x must be 256-bit aligned
// memory should be allocated in consequence
static void avx_row_axpy(double a, double *__restrict__ x, double *__restrict__ y, int from, int to)
{
103
#if defined(__AVX2__)
Jonathan Lambrechts's avatar
Jonathan Lambrechts committed
104
105
106
107
108
109
110
111
112
113
  int pto = (to+3)/4*4;
  int pfrom = (from+3)/4*4;
  for (int i = from; i < pfrom; ++i)
    y[i] += a*x[i];
  __m256d ma = _mm256_set1_pd(a);
  for (int i = pfrom; i < pto; i+=4) {
    __m256d mx = _mm256_load_pd(x+i);
    __m256d my = _mm256_load_pd(y+i);
    _mm256_store_pd(y+i,_mm256_fmadd_pd(ma,mx,my));
  }
114
#elif defined(__AVX__)
115
116
117
118
  int pto = (to+1)/2*2;
  int pfrom = (from+1)/2*2;
  for (int i = from; i < pfrom; ++i)
    y[i] += a*x[i];
Jonathan Lambrechts's avatar
Jonathan Lambrechts committed
119
  __m128d ma = _mm_set1_pd(a);
120
  for (int i = pfrom; i < pto; i+=2) {
Jonathan Lambrechts's avatar
Jonathan Lambrechts committed
121
122
    __m128d mx = _mm_load_pd(x+i);
    __m128d my = _mm_load_pd(y+i);
123
124
125
126
127
128
#if defined(__FMA__)
    _mm_store_pd(y+i, _mm_fmadd_pd(ma,mx,my));
#else
    __m128d mz = _mm_mul_pd(ma, mx);
    _mm_store_pd(y+i, _mm_add_pd(my,mz));
#endif
Jonathan Lambrechts's avatar
Jonathan Lambrechts committed
129
130
131
132
133
134
135
  }
#else
  for (int i = from; i < to; ++i)
    y[i] += a*x[i];
#endif
}

Jonathan Lambrechts's avatar
tmp    
Jonathan Lambrechts committed
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
struct LinearSystemStruct{
  double *M;
  int *row_start;
  int *row_end;
  int *row_lu_end;
  double **rows;
  double *b;
  double *x;
  int *node_map;
  int n_fields;
  int n;
  Mesh *mesh;
  int *isfixed;
};

static int imin(int a, int b) {
  return a < b ? a : b;
}

static int imax(int a, int b) {
  return a > b ? a : b;
}
Jonathan Lambrechts's avatar
Jonathan Lambrechts committed
158

Jonathan Lambrechts's avatar
tmp    
Jonathan Lambrechts committed
159
static int LUPDecompose(LinearSystem *system){
Jonathan Lambrechts's avatar
Jonathan Lambrechts committed
160
  int i,j;
Jonathan Lambrechts's avatar
tmp    
Jonathan Lambrechts committed
161
162
  int N = system->n;
  double **A = system->rows;
Jonathan Lambrechts's avatar
Jonathan Lambrechts committed
163
  for(i=0;i<N;i++){
Jonathan Lambrechts's avatar
tmp    
Jonathan Lambrechts committed
164
165
    for(j=i+1;j<system->row_lu_end[i];j++){
      if(i < system->row_start[j] || A[j][i] == 0.)
Jonathan Lambrechts's avatar
Jonathan Lambrechts committed
166
167
        continue;
      A[j][i]/=A[i][i];
Jonathan Lambrechts's avatar
tmp    
Jonathan Lambrechts committed
168
      avx_row_axpy(-A[j][i],A[i],A[j],i+1,system->row_end[i]);
Jonathan Lambrechts's avatar
Jonathan Lambrechts committed
169
170
    }
  }
Jonathan Lambrechts's avatar
tmp    
Jonathan Lambrechts committed
171
  return(1);
Jonathan Lambrechts's avatar
Jonathan Lambrechts committed
172
173
}

Jonathan Lambrechts's avatar
tmp    
Jonathan Lambrechts committed
174
175
176
177
178
static void LUPSolve(LinearSystem *system){
  double *x = system->x;
  double *b= system->b;
  int N = system->n;
  double **A = system->rows;
Jonathan Lambrechts's avatar
Jonathan Lambrechts committed
179
180
181
  for(int i=0;i<N;i++){
    //x[i]=b[P[i]];
    x[i]=b[i];
Jonathan Lambrechts's avatar
tmp    
Jonathan Lambrechts committed
182
    for(int k=system->row_start[i];k<i;k++)
Jonathan Lambrechts's avatar
Jonathan Lambrechts committed
183
184
185
      x[i]-=A[i][k]*x[k];
  }
  for(int i=N-1;i>=0;i--){
186
    for(int k=i+1;k<system->row_end[i] && k < N;k++){
Jonathan Lambrechts's avatar
Jonathan Lambrechts committed
187
      x[i]-=A[i][k]*x[k];
188
    }
Jonathan Lambrechts's avatar
Jonathan Lambrechts committed
189
190
191
192
    x[i]=x[i]/A[i][i];
  }
}

Jonathan Lambrechts's avatar
tmp    
Jonathan Lambrechts committed
193
194
195
196
197
198
static void linear_system_zero_row(LinearSystem *system, int i)
{
  for (int j = system->row_start[i]; j < system->row_end[i]; ++j) {
    system->rows[i][j] = 0.;
  }
}
Jonathan Lambrechts's avatar
Jonathan Lambrechts committed
199
200
201
202
203
204
205

LinearSystem *linear_system_new(Mesh *mesh, int n_fields, int n_boundaries, const StrongBoundary *boundaries)
{
  LinearSystem *system = malloc(sizeof(LinearSystem));
  system->n_fields = n_fields;
  system->mesh = mesh;
  system->node_map = malloc(sizeof(int)*mesh->n_nodes);
Jonathan Lambrechts's avatar
tmp    
Jonathan Lambrechts committed
206
  reverse_cuthill_mckee(mesh, system->node_map);
207
208
209
210
211
212
213
214
215
  int n_used_nodes = 0;
  for (int i = 0; i < mesh->n_nodes; ++i)
    if (system->node_map[i] >= 0)
      n_used_nodes ++;

  system->n = n_used_nodes*system->n_fields;
  int *node_row_start = malloc(sizeof(int)*n_used_nodes);
  int *node_row_end = malloc(sizeof(int)*n_used_nodes);
  for (int i = 0; i < n_used_nodes; ++i) {
Jonathan Lambrechts's avatar
tmp    
Jonathan Lambrechts committed
216
    node_row_end[i] = 0;
217
    node_row_start[i] = n_used_nodes;
Jonathan Lambrechts's avatar
tmp    
Jonathan Lambrechts committed
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
  }
  for (int i = 0; i < mesh->n_triangles; ++i) {
    int *tri = mesh->triangles + i*3;
    for (int j = 0; j < 3; ++j) {
      int n0 = system->node_map[tri[j]];
      for (int k = 0; k < 3; ++k) {
        int n1 = system->node_map[tri[k]];
        if (node_row_start[n0] > n1)
          node_row_start[n0] = n1;
        if (node_row_end[n0] < n1)
          node_row_end[n0] = n1;
      }
    }
  }
  int total_size = 0;
  system->row_start = malloc(sizeof(int)*system->n);
  system->row_end = malloc(sizeof(int)*system->n);
  system->row_lu_end = malloc(sizeof(int)*system->n);
236
  for (int i = 0; i < n_used_nodes; ++i) {
Jonathan Lambrechts's avatar
tmp    
Jonathan Lambrechts committed
237
238
    for (int j = 0; j < n_fields; ++j) {
      int k = i*n_fields +j;
239
240
      system->row_start[k] = (node_row_start[i]*n_fields)/PADDING*PADDING;
      system->row_end[k] = (node_row_end[i]*n_fields+n_fields+(PADDING-1))/PADDING*PADDING;
Jonathan Lambrechts's avatar
tmp    
Jonathan Lambrechts committed
241
242
243
244
245
246
247
248
249
250
251
252
253
      system->row_lu_end[k] = k;
    }
  }
  for (int i = 0; i < system->n; ++i) {
    for (int j = system->row_start[i]; j < i; ++j) {
      if (system->row_lu_end[j] < i+1) system->row_lu_end[j] = i+1;
      if (system->row_end[i] < system->row_end[j]) system->row_end[i] = system->row_end[j];
    }
    total_size += system->row_end[i]-system->row_start[i];
  }
  free(node_row_start);
  free(node_row_end);
  system->M = _mm_malloc(sizeof(double)*total_size, 32);
Jonathan Lambrechts's avatar
Jonathan Lambrechts committed
254
  system->rows = malloc(sizeof(double*)*system->n);
Jonathan Lambrechts's avatar
tmp    
Jonathan Lambrechts committed
255
  for (int i = 0; i < total_size; ++i)
Jonathan Lambrechts's avatar
Jonathan Lambrechts committed
256
    system->M[i] = 0;
Jonathan Lambrechts's avatar
tmp    
Jonathan Lambrechts committed
257
258
259
  system->rows[0] = system->M;
  for (int i = 1; i < system->n; ++i){
    system->rows[i] = system->rows[i-1]+system->row_end[i-1]-system->row_start[i];
Jonathan Lambrechts's avatar
Jonathan Lambrechts committed
260
261
262
263
264
265
266
267
268
269
270
271
272
273
  }
  system->b = malloc(sizeof(double)*system->n);
  system->x = malloc(sizeof(double)*system->n);
  system->isfixed = malloc(sizeof(int)*system->n);
  for (int i = 0; i < system->n; ++i) {
    system->isfixed[i] = 0;
  }
  for (int ibnd = 0; ibnd < n_boundaries; ++ibnd) {
    const StrongBoundary *bnd = boundaries + ibnd;
    int iphys;
    for (iphys = 0; iphys < mesh->n_phys; ++iphys) {
      if (strcmp(bnd->tag, mesh->phys_name[iphys]) == 0)
        break;
    }
274
275
276
277
    if (iphys == mesh->n_phys){
      printf("Boundary tag \"%s\" not found.\n", bnd->tag);
      continue;
    }
Jonathan Lambrechts's avatar
Jonathan Lambrechts committed
278
    for (int i = 0; i < mesh->phys_n_nodes[iphys]; ++i){
279
280
      if (system->node_map[mesh->phys_nodes[iphys][i]] >= 0)
        system->isfixed[system->node_map[mesh->phys_nodes[iphys][i]]*system->n_fields+bnd->field] = 1;
Jonathan Lambrechts's avatar
Jonathan Lambrechts committed
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
    }
  }
  return system;
}

void linear_system_add_to_matrix(LinearSystem *system, int el0, int el1, const double *local_matrix){
  int *tri0 = &system->mesh->triangles[el0*3];
  int *tri1 = &system->mesh->triangles[el1*3];
  int nf = system->n_fields;
  for (int i = 0; i < 3; ++i) {
    for (int inf = 0; inf < 3; ++inf) {
      int ii = system->node_map[tri0[i]]*nf + inf;
      for (int j = 0; j < 3; ++j) {
        for (int jnf = 0; jnf < nf; ++jnf) {
          int jj = system->node_map[tri1[j]]*nf + jnf;
          system->rows[ii][jj] += local_matrix[(inf*3+i)*nf*3+jnf*3+j];
        }
      }
    }
  }
}

void linear_system_add_to_rhs(LinearSystem *system, int el0, const double *local_vector)
{
  const int *tri = system->mesh->triangles + el0*3;
  int n_fields = system->n_fields;
  for (int i = 0; i < n_fields; ++i) {
    for (int j = 0; j < 3; ++j) {
      int m = system->node_map[tri[j]]*n_fields+i;
      system->b[m] += local_vector[i*3+j];
    }
  }
}

void linear_system_zero_matrix_and_rhs(LinearSystem *system)
{
  for (int i = 0; i < system->n; ++i){
    system->b[i] = 0.;
    linear_system_zero_row(system, i);
  }
}

void linear_system_solve(LinearSystem *system, double *solution){
  double **rows = system->rows;
  for (int i = 0; i < system->n; ++i){
    if (system->isfixed[i]) {
      linear_system_zero_row(system, i);
      rows[i][i] = 1.;
      system->b[i] = 0;
    }
  }
Jonathan Lambrechts's avatar
tmp    
Jonathan Lambrechts committed
332
333
  LUPDecompose(system);
  LUPSolve(system);
Jonathan Lambrechts's avatar
Jonathan Lambrechts committed
334
335
  for (int i = 0; i < system->mesh->n_nodes; ++i){
    int ii = system->node_map[i];
336
337
    if (ii < 0)
      continue;
338
    for (int j = 0; j < system->n_fields; ++j){
Jonathan Lambrechts's avatar
Jonathan Lambrechts committed
339
      solution[i*system->n_fields+j] = system->x[ii*system->n_fields+j];
340
    }
Jonathan Lambrechts's avatar
Jonathan Lambrechts committed
341
342
343
344
345
346
347
348
349
  }
}

void linear_system_free(LinearSystem *system)
{
  free(system->b);
  free(system->x);
  free(system->M);
  free(system->rows);
Jonathan Lambrechts's avatar
tmp    
Jonathan Lambrechts committed
350
351
352
  free(system->row_start);
  free(system->row_end);
  free(system->row_lu_end);
Jonathan Lambrechts's avatar
Jonathan Lambrechts committed
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
  free(system->isfixed);
  free(system->node_map);
  free(system);
}

double linear_system_get_rhs_norm(LinearSystem *system)
{
  double norm = 0;
  for (int i = 0; i < system->n;++i)
    if (!system->isfixed[i])
      norm += system->b[i]*system->b[i];
  return sqrt(norm);
}

void initialize_linear_solver(int argc, char **argv){}