ViennaCL - The Vienna Computing Library  1.6.0
Free open-source GPU-accelerated linear algebra and solver library.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
sse_kernels.hpp
Go to the documentation of this file.
1 #ifndef VIENNACL_LINALG_HOST_BASED_SSE_KERNELS_HPP_
2 #define VIENNACL_LINALG_HOST_BASED_SSE_KERNELS_HPP_
3 
4 /* =========================================================================
5  Copyright (c) 2010-2014, Institute for Microelectronics,
6  Institute for Analysis and Scientific Computing,
7  TU Wien.
8  Portions of this software are copyright by UChicago Argonne, LLC.
9 
10  -----------------
11  ViennaCL - The Vienna Computing Library
12  -----------------
13 
14  Project Head: Karl Rupp rupp@iue.tuwien.ac.at
15 
16  (A list of authors and contributors can be found in the PDF manual)
17 
18  License: MIT (X11), see file LICENSE in the base directory
19 ============================================================================= */
20 
27 #ifdef VIENNACL_WITH_OPENMP
28 #include <omp.h>
29 #endif
30 
31 #include <iostream>
32 #include <vector>
33 
34 //for std::min
35 #include <algorithm>
36 
38 
39 namespace viennacl
40 {
41  namespace linalg
42  {
43  namespace host_based
44  {
45  namespace detail
46  {
47 
48  // returns true if the matrix is hermitian (or real symmetric), false otherwise
49  template<typename ScalarType>
50  bool isHermitian(ScalarType ** const A, vcl_size_t n)
51  {
52  for (vcl_size_t i=0;i<n;i++)
53  for (vcl_size_t j=i;j<n;j++)
54  if (A[i][j] != conjIfComplex(A[j][i]))
55  return false;
56  return true;
57  }
58 
59  // returns the bandwidth of a hermitian (or real symmetric) matrix
60  template<typename ScalarType>
62  {
63  for (vcl_size_t i=n-1;i>=0;i--)
64  for (vcl_size_t j=0;j<n-i;j++)
65  if (A[i+j][j]!=ScalarType(0))
66  return 2*i+1;
67  return 0;
68  }
69 
70  // helper for tridiagonalizeBandedMatrix
71  // does a householder similarity transform to eliminate a range of nonzeros in a row of a hermitian matrix
72  template<typename ScalarType>
74  {
75  if (from>=to)
76  return;
77 
78  ScalarType norm=_nrm2(&A[row][row+from],to-from);
79 
80  if (norm != ScalarType(0))
81  {
82 
83  //pick the better of two reflectors, to 1 or -1
84  //this is wierd syntax that also works with std::complex
85  if (std::abs(A[row][row+from]-ScalarType(1))>std::abs(A[row][row+from]+ScalarType(1)))
86  norm=-norm;
87  for (vcl_size_t i=row+from;i<row+to;i++)
88  A[row][i]/=norm;
89  A[row][row+from]+=ScalarType(1);
90 
91  //apply the similarity transformation
92 
93  //left transformation
94  for (vcl_size_t j=row+1;j<row+width;j++)
95  {
96  ScalarType s=_dotc(to-from,&A[row][row+from],&A[j][row+from]);
97  s=-s/A[row][row+from];
98  _axpy(&A[row][row+from],&A[j][row+from],to-from,s);
99  }
100 
101  //conjugate householder reflector for right transformation
102  for (vcl_size_t i=row+from;i<row+to;i++)
103  A[row][i]=conjIfComplex(A[row][i]);
104 
105  //right transformation (cache aligned)
106  for (vcl_size_t i=0;i<width;i++)
107  ss[i]=ScalarType(0);
108  for (vcl_size_t i=from;i<to;i++)
109  _axpy(&A[row+i][row],ss,width,conjIfComplex(A[row][row+i]));
110  for (vcl_size_t i=0;i<width;i++)
111  ss[i]=-ss[i]/A[row][row+from];
112  for (vcl_size_t i=from;i<to;i++)
113  _axpy(ss,&A[row+i][row],width,A[row][row+i]);
114 
115  //clean up the householder reflector
116  for (vcl_size_t col=row+from;col<row+to;col++)
117  A[row][col]=conjIfComplex(A[col][row]);
118 
119  }
120  }
121 
122  // reduces a hermitian (or symmetric real) banded matrix to a hermitian (or symmetric real) tridiagonal matrix,
123  // using householder similarity transforms, so eigenvalues are preserved.
124  // bandwidth should be an odd integer, such as 3 for an already tridiagonal matrix
125  // based on http://www.netlib.org/lapack/lawnspdf/lawn208.pdf
126  template<typename ScalarType>
128  {
129  if (bandwidth<=3)
130  return;
131 
132  vcl_size_t belowDiagonal=(bandwidth-1)/2;
133  ScalarType *ss=new ScalarType[bandwidth+belowDiagonal];
134 
135  //eliminate and chase bulges where the elimination makes a bulge
136  vcl_size_t k=0;
137  for (;k<n-belowDiagonal;k++)
138  {
139 
140  //eliminate below the diagonal
141  eliminateHermitian(A,k,1,1+belowDiagonal,std::min(n-k,2*belowDiagonal+1),ss);
142 
143  //chase the bulge
144  for (vcl_size_t bulgeStart=k+1;bulgeStart<n-belowDiagonal;bulgeStart+=belowDiagonal)
145  for (vcl_size_t i=0;i<belowDiagonal-1;i++)
146  eliminateHermitian(A,bulgeStart+i,belowDiagonal,std::min(n-bulgeStart-i,belowDiagonal*2-i),std::min(n-bulgeStart-i,bandwidth+belowDiagonal),ss);
147  }
148 
149  //eliminate beyond where elimination makes bulges
150  for (;k<n-2;k++)
151  eliminateHermitian(A,k,1,n-k,n-k,ss);
152 
153  delete [] ss;
154  }
155 
156  // reduces a hermitian (or symmetric real) matrix to a hermitian (or symmetric real) banded matrix with bandwidth 2*block_size+1
157  // using householder similarity transformations, so eigenvalues are preserved. reduceToBandedMatrix(A,1) reduces the matrix to tridiagonal
158  template<typename ScalarType>
160  {
161  ScalarType* norms=new ScalarType[block_size];
162  ScalarType* ss=new ScalarType[n];
163 
164  for (vcl_size_t k=0;k<n-block_size;k+=block_size)
165  {
166  for (vcl_size_t bi=0;bi<std::min(block_size,n-k-block_size);bi++)
167  {
168 
169  //this is the same as the norm of the column, since it's hermetian
170  norms[bi]=_nrm2(&A[k+bi][k+bi+block_size],n-k-bi-block_size);
171 
172  if (norms[bi]!=ScalarType(0))
173  {
174 
175  //pick the better of two reflectors, to 1 or -1
176  //this is wierd syntax that also works with std::complex
177  if (std::abs(A[k+bi][k+bi+block_size]-ScalarType(1))>std::abs(A[k+bi][k+bi+block_size]+ScalarType(1)))
178  norms[bi]=-norms[bi];
179  for (vcl_size_t i=k+bi+block_size;i<n;i++)
180  A[k+bi][i]/=norms[bi];
181  A[k+bi][k+bi+block_size]+=ScalarType(1);
182 
183  // Apply transformation to remaining rows within the block
184  for (vcl_size_t j=k+bi+1;j<k+block_size;j++)
185  {
186  ScalarType s=_dotc(n-k-bi-block_size,&A[k+bi][k+bi+block_size],&A[j][k+bi+block_size]);
187  s=-s/A[k+bi][k+bi+block_size];
188  _axpy(&A[k+bi][k+bi+block_size],&A[j][k+bi+block_size],n-k-bi-block_size,s);
189  }
190  }
191  }
192 
193  //apply transformations from block to remaining rows and columns the block in parallel
194 
195  //left transformations
196  #ifdef VIENNACL_WITH_OPENMP
197  #pragma omp parallel for
198  for (int j=k+block_size;j<(int)n;j++)
199  #else
200  for (vcl_size_t j=k+block_size;j<n;j++)
201  #endif
202  {
203  for (vcl_size_t bi=0;bi<std::min(block_size,n-k-block_size);bi++)
204  {
205  if (norms[bi]!=ScalarType(0))
206  {
207  ScalarType s=_dotc(n-k-bi-block_size,&A[k+bi][k+bi+block_size],&A[j][k+bi+block_size]);
208  s=-s/A[k+bi][k+bi+block_size];
209  _axpy(&A[k+bi][k+bi+block_size],&A[j][k+bi+block_size],n-k-bi-block_size,s);
210  }
211  }
212  }
213 
214  //conjugate householder reflectors for right transformations
215  for (vcl_size_t bi=0;bi<block_size;bi++)
216  for (vcl_size_t i=k+bi+block_size;i<n;i++)
217  A[k+bi][i]=conjIfComplex(A[k+bi][i]);
218 
219  //right transformations (cache aligned)
220  #ifdef VIENNACL_WITH_OPENMP
221  #pragma omp parallel for
222  for (int section=0;section<(int)num_threads;section++)
223  #else
224  for (vcl_size_t section=0;section<num_threads;section++)
225  #endif
226  {
227  vcl_size_t start=((n-k)*(section+0))/num_threads+k;
228  vcl_size_t end =((n-k)*(section+1))/num_threads+k;
229  vcl_size_t length=end-start;
230  for (vcl_size_t bi=0;bi<std::min(block_size,n-k-block_size);bi++)
231  {
232  if (norms[bi]!=ScalarType(0))
233  {
234  for (vcl_size_t i=start;i<end;i++)
235  ss[i]=ScalarType(0);
236  for (vcl_size_t i=k+bi+block_size;i<n;i++)
237  _axpy(&A[i][start],ss+start,length,conjIfComplex(A[k+bi][i]));
238  for (vcl_size_t i=start;i<end;i++)
239  ss[i]=-ss[i]/A[k+bi][k+bi+block_size];
240  for (vcl_size_t i=k+bi+block_size;i<n;i++)
241  _axpy(ss+start,&A[i][start],length,A[k+bi][i]);
242  }
243  }
244  }
245 
246  //clean up householder reflectors
247  for (vcl_size_t row=k;row<k+block_size;row++)
248  for (vcl_size_t col=row+block_size;col<n;col++)
249  A[row][col]=conjIfComplex(A[col][row]);
250  }
251  delete [] norms;
252  delete [] ss;
253  }
254 
255  } //namespace detail
256 
265  template<typename ScalarType>
266  void inplace_tred2(ScalarType ** A, vcl_size_t n, vcl_size_t block_size = 1, vcl_size_t num_threads = 1)
267  {
268  if (!detail::isHermitian(A,n))
269  std::cerr << "ViennaCL: Warning in inplace_tred2(): Matrix is not hermitian (or real symmetric)" << std::endl;
270 
271  // Don't touch the whole matrix if the bandwidth is already small.
272  // There's nothing numerically significant about n*4,
273  // it's just a point I chose to switch to assuming the matrix is full.
275  if (bandwidth*bandwidth*num_threads<n*4 || 2*block_size+1>bandwidth)
277  else
278  {
279  detail::reduceHermitianToBandedMatrix(A,n,block_size,num_threads);
281  }
282  }
283 
294  template<typename ScalarType>
295  bool lu_factorize_row_major(ScalarType ** A, vcl_size_t m, vcl_size_t n, vcl_size_t * piv = NULL, vcl_size_t block_size = 8)
296  {
297  // Use a parallel "left-looking", row-operation-based, block Crout/Doolittle algorithm.
298  if (piv)
299  for (vcl_size_t i=0; i<m; i++)
300  piv[i]=i;
301  bool pivsign=true;
302 
303  // Outer loop.
304  for (vcl_size_t j=0; j<std::min(m,n); j+=block_size)
305  {
306  block_size=std::min(std::min(m-j,n-j),block_size);
307 
308  //do Gaussian elimination with partial pivoting in the block
309  //(in the first few columns of the matrix)
310  for (vcl_size_t bi=0;bi<block_size;bi++)
311  {
312  // Find pivot and exchange if necessary.
313  vcl_size_t p=j+bi;
314  if (piv)
315  {
316  for (vcl_size_t i=j+bi+1; i<m; i++)
317  if (std::abs(A[i][j+bi])>std::abs(A[p][j+bi]))
318  p=i;
319 
320  if (p!=j+bi)
321  {
322  for (vcl_size_t k=0; k<n; k++)
323  {
324  ScalarType t=A[p][k];
325  A[p][k]=A[j+bi][k];
326  A[j+bi][k]=t;
327  }
328 
329  //swap pivot vector
330  vcl_size_t k = piv[p];
331  piv[p] = piv[j+bi];
332  piv[j+bi] = k;
333  pivsign = !pivsign;
334  }
335  }
336 
337  //eliminate below the diagonal in the block
338  ScalarType elimVal=A[j+bi][j+bi];
339  if (elimVal==ScalarType(0))
340  {
341  //apply previous transformations from the block to the top of the submatrix
342  for (vcl_size_t row=j+1;row<j+bi;row++)
343  for (vcl_size_t bi_=0;bi_<row-j;bi_++)
344  if (A[row][j+bi_]!=ScalarType(0))
345  _axpy(&(A[j+bi_][j+block_size]),&(A[row][j+block_size]),n-j-block_size,-A[row][j+bi_]);
346  return pivsign;
347  }
348  for (vcl_size_t row=j+bi+1;row<m;row++)
349  {
350  ScalarType multiplier=A[row][j+bi]/elimVal;
351  for (vcl_size_t col=j+bi;col<j+block_size;col++)
352  A[row][col]-=multiplier*A[j+bi][col];
353  A[row][j+bi]=multiplier;
354  }
355  }
356 
357  //at this point, the matrix looks something like this (if block size were 4)
358  //
359  //U U U U * * * *
360  //L U U U * * * *
361  //L L U U * * * *
362  //L L L U * * * *
363  //L L L L * * * *
364  //L L L L * * * *
365  //L L L L * * * *
366  //L L L L * * * *
367 
368  //apply previous transformations from the block to the top of the submatrix
369  for (vcl_size_t row=j+1;row<j+block_size;row++)
370  for (vcl_size_t bi=0;bi<row-j;bi++)
371  if (A[row][j+bi]!=ScalarType(0))
372  _axpy(&(A[j+bi][j+block_size]),&(A[row][j+block_size]),n-j-block_size,-A[row][j+bi]);
373 
374  //at this point, the matrix looks something like this (if block size were 4)
375  //
376  //U U U U U U U U
377  //L U U U U U U U
378  //L L U U U U U U
379  //L L L U U U U U
380  //L L L L * * * *
381  //L L L L * * * *
382  //L L L L * * * *
383  //L L L L * * * *
384 
385  //apply previous transformations from the block in parallel to the rest of the submatrix
386  #ifdef VIENNACL_OPENMP
387  #pragma omp parallel for
388  for (int row=j+block_size;row<(int)m;row++)
389  #else
390  for (vcl_size_t row=j+block_size;row<m;row++)
391  #endif
392  for (vcl_size_t bi=0;bi<block_size;bi++)
393  if (A[row][j+bi]!=ScalarType(0))
394  _axpy(&(A[j+bi][j+block_size]),&(A[row][j+block_size]),n-j-block_size,-A[row][j+bi]);
395  }
396  return pivsign;
397  }
398 
406  template<typename ScalarType>
407  std::vector<ScalarType> inplace_qr_col_major(ScalarType ** A, vcl_size_t m, vcl_size_t n, vcl_size_t block_size = 8)
408  {
409  std::vector<ScalarType> betas(std::min(m,n));
410  ScalarType* norms=new ScalarType[block_size];
411 
412  for (vcl_size_t k=0; k<std::min(m,n); k+=block_size)
413  {
414  block_size=std::min(std::min(m-k,n-k),block_size);
415 
416  for (vcl_size_t bi=0;bi<block_size;bi++)
417  {
418 
419  // Compute 2-norm of k+bi-th column below the diagonal
420  norms[bi]=_nrm2(&A[k+bi][k+bi],m-k-bi);
421 
422  if (norms[bi]!=ScalarType(0))
423  {
424  //pick the better of two reflectors, to 1 or -1,
425  //this is wierd syntax that also works with std::complex
426  if (std::abs(A[k+bi][k+bi]-ScalarType(1))>std::abs(A[k+bi][k+bi]+ScalarType(1)))
427  norms[bi]*=-1;
428  for (vcl_size_t i=k+bi;i<m;i++)
429  A[k+bi][i]/=norms[bi];
430  A[k+bi][k+bi]+=ScalarType(1);
431 
432  // Apply transformation to columns within the block
433  for (vcl_size_t j=k+bi+1; j<k+block_size; j++)
434  {
435  ScalarType s=_dotc(m-k-bi,&A[k+bi][k+bi],&A[j][k+bi]);
436  s = -s/A[k+bi][k+bi];
437  _axpy(&A[k+bi][k+bi],&A[j][k+bi],m-k-bi,s);
438  }
439  }
440  //temporarily store the diagonal value of R in betas
441  betas[k+bi]=-norms[bi];
442  }
443 
444  //apply transformations from block to remaining columns to the right of the block in parallel
445  #ifdef VIENNACL_OPENMP
446  #pragma omp parallel for
447  for (int j=k+block_size; j<(int)n; j++)
448  #else
449  for (vcl_size_t j=k+block_size; j<n; j++)
450  #endif
451  {
452  for (vcl_size_t bi=0;bi<block_size;bi++)
453  {
454  if (norms[bi]!=ScalarType(0))
455  {
456  ScalarType s=_dotc(m-k-bi,&A[k+bi][k+bi],&A[j][k+bi]);
457  s = -s/A[k+bi][k+bi];
458  _axpy(&A[k+bi][k+bi],A[j]+k+bi,m-k-bi,s);
459  }
460  }
461  }
462  }
463 
464  //normalize the householder reflectors and store the betas
465  for (vcl_size_t j=0;j<std::min(m,n);j++)
466  {
467  ScalarType beta=A[j][j];
468  for (vcl_size_t i=j+1;i<m;i++)
469  A[j][i]/=beta;
470  A[j][j]=betas[j];//R diagonal values were stored temporarily in betas
471  betas[j]=beta;
472  }
473 
474  delete [] norms;
475  return betas;
476  }
477 
486  template<typename ScalarType>
487  std::vector<ScalarType> inplace_qr_row_major(ScalarType ** A, vcl_size_t m, vcl_size_t n, vcl_size_t block_size = 8, vcl_size_t num_threads = 1)
488  {
489  std::vector<ScalarType> betas(std::min(m,n));
490  ScalarType* norms=new ScalarType[block_size];
491  ScalarType* ss=new ScalarType[n];
492 
493  //allocate O(m) memory for temporary column-major storage of the block for blas functions
494  ScalarType** block_cols=new ScalarType*[block_size];
495  for (vcl_size_t i=0;i<block_size;i++)
496  block_cols[i]=new ScalarType[m];
497 
498  for (vcl_size_t k=0; k<std::min(m,n); k+=block_size)
499  {
500  block_size=std::min(std::min(m-k,n-k),block_size);
501 
502  //copy the block to column-major storage for cache alignment (necessary for _nrm2)
503  for (vcl_size_t i=0;i<m-k;i++)
504  for (vcl_size_t bi=0;bi<block_size;bi++)
505  block_cols[bi][i]=A[k+i][k+bi];
506 
507  for (vcl_size_t bi=0;bi<block_size;bi++)
508  {
509 
510  // Compute 2-norm of k+bi-th column below the diagonal
511  norms[bi]=_nrm2(&block_cols[bi][bi],m-k-bi);
512 
513  if (norms[bi]!=ScalarType(0))
514  {
515  //pick the better of two reflectors, to 1 or -1,
516  //this is wierd syntax that also works with std::complex
517  if (std::abs(block_cols[bi][bi]-ScalarType(1))>std::abs(block_cols[bi][bi]+ScalarType(1)))
518  norms[bi]*=-1;
519  for (vcl_size_t i=bi;i<m-k;i++)
520  block_cols[bi][i]/=norms[bi];
521  block_cols[bi][bi]+=ScalarType(1);
522 
523  // Apply transformation to columns within the block
524  for (vcl_size_t j=bi+1; j<block_size; j++)
525  {
526  ScalarType s=_dotc(m-k-bi,&block_cols[bi][bi],&block_cols[j][bi]);
527  s = -s/block_cols[bi][bi];
528  _axpy(&block_cols[bi][bi],&block_cols[j][bi],m-k-bi,s);
529  }
530  }
531  //temporarily store the diagonal value of R in betas
532  betas[k+bi]=-norms[bi];
533  }
534 
535  //copy the block back to row-major storage
536  for (vcl_size_t i=0;i<m-k;i++)
537  for (vcl_size_t bi=0;bi<block_size;bi++)
538  A[k+i][k+bi]=block_cols[bi][i];
539 
540  //apply transformations from block to remaining rows to the right of the block in parallel
541  #ifdef VIENNACL_OPENMP
542  #pragma omp parallel for
543  for (int section=0;section<(int)num_threads;section++)
544  #else
545  for (vcl_size_t section=0;section<num_threads;section++)
546  #endif
547  {
548  vcl_size_t start=((n-k-block_size)*(section+0))/num_threads+k+block_size;
549  vcl_size_t end =((n-k-block_size)*(section+1))/num_threads+k+block_size;
550  vcl_size_t length=end-start;
551  for (vcl_size_t bi=0;bi<block_size;bi++)
552  {
553  if (norms[bi]!=ScalarType(0))
554  {
555  for (vcl_size_t i=start;i<end;i++)
556  ss[i]=ScalarType(0);
557  for (vcl_size_t i=k+bi;i<m;i++)
558  _axpy(&A[i][start],ss+start,length,A[i][k+bi]);
559  for (vcl_size_t i=start;i<end;i++)
560  ss[i]=-ss[i]/A[k+bi][k+bi];
561  for (vcl_size_t i=k+bi;i<m;i++)
562  _axpy(ss+start,&A[i][start],length,A[i][k+bi]);
563  }
564  }
565  }
566  }
567 
568  //normalize the householder reflectors and store the betas
569  for (vcl_size_t j=0;j<std::min(m,n);j++)
570  {
571  ScalarType beta=A[j][j];
572  for (vcl_size_t i=j+1;i<m;i++)
573  A[i][j]/=beta;
574  A[j][j]=betas[j];//R diagonal values were stored temporarily in betas
575  betas[j]=beta;
576  }
577 
578  delete [] norms;
579  for (vcl_size_t i=0;i<block_size;i++)
580  delete [] block_cols[i];
581  delete [] block_cols;
582  delete [] ss;
583 
584  return betas;
585  }
586 
587  } //namespace host_based
588  } //namespace linalg
589 } //namespace viennacl
590 #endif
optimized BLAS functions using SSE2 and SSE3 intrinsic functions
std::vector< ScalarType > inplace_qr_row_major(ScalarType **A, vcl_size_t m, vcl_size_t n, vcl_size_t block_size=8, vcl_size_t num_threads=1)
Inplace qr factorization of an m x n dense row-major matrix, returning the householder normalization ...
void _axpy(const T *, T *, vcl_size_t, T)
Definition: sse_blas.hpp:73
std::vector< ScalarType > inplace_qr_col_major(ScalarType **A, vcl_size_t m, vcl_size_t n, vcl_size_t block_size=8)
Inplace qr factorization of an m x n dense column-major matrix, returning the householder normalizati...
void eliminateHermitian(ScalarType **A, vcl_size_t row, vcl_size_t from, vcl_size_t to, vcl_size_t width, ScalarType *ss)
Definition: sse_kernels.hpp:73
void reduceHermitianToBandedMatrix(ScalarType **A, vcl_size_t n, vcl_size_t block_size, vcl_size_t num_threads)
result_of::size_type< T >::type start(T const &obj)
Definition: start.hpp:44
bool lu_factorize_row_major(ScalarType **A, vcl_size_t m, vcl_size_t n, vcl_size_t *piv=NULL, vcl_size_t block_size=8)
Inplace lu factorization of an m x n dense row-major matrix with optional partial pivoting...
std::size_t vcl_size_t
Definition: forwards.h:74
T _nrm2(const T *, vcl_size_t)
Definition: sse_blas.hpp:117
vector_expression< const matrix_base< NumericT, F >, const unsigned int, op_row > row(const matrix_base< NumericT, F > &A, unsigned int i)
Definition: matrix.hpp:853
T _dotc(vcl_size_t, const T *, const T *)
Definition: sse_blas.hpp:89
float ScalarType
Definition: fft_1d.cpp:42
bool isHermitian(ScalarType **const A, vcl_size_t n)
Definition: sse_kernels.hpp:50
vcl_size_t getHermitianBandwidth(ScalarType **const A, vcl_size_t n)
Definition: sse_kernels.hpp:61
T min(const T &lhs, const T &rhs)
Minimum.
Definition: util.hpp:45
void tridiagonalizeHermitianBandedMatrix(ScalarType **A, vcl_size_t n, vcl_size_t bandwidth)
void inplace_tred2(ScalarType **A, vcl_size_t n, vcl_size_t block_size=1, vcl_size_t num_threads=1)
Inplace reduction of a dense n x n row-major or column-major hermitian (or real symmetric) matrix to ...